[llvm] 9806101 - [X86] X86FixupVectorConstantsPass - attempt to replace full width fp vector constant loads with broadcasts on AVX+ targets
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Mon May 29 08:30:59 PDT 2023
Author: Simon Pilgrim
Date: 2023-05-29T16:10:52+01:00
New Revision: 98061013e01207444cfd3980cde17b5e75764fbe
URL: https://github.com/llvm/llvm-project/commit/98061013e01207444cfd3980cde17b5e75764fbe
DIFF: https://github.com/llvm/llvm-project/commit/98061013e01207444cfd3980cde17b5e75764fbe.diff
LOG: [X86] X86FixupVectorConstantsPass - attempt to replace full width fp vector constant loads with broadcasts on AVX+ targets
lowerBuildVectorAsBroadcast will not broadcast splat constants in all cases, leaving many situations where a full-width vector load that has failed to fold, but is loading splat constant values, could use a broadcast load instruction just as cheaply while saving constant pool space.
NOTE: SSE3 targets can use MOVDDUP, but not all SSE-era CPUs can perform this as cheaply as a vector load; we will need to add scheduler model checks if we want to pursue this.
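To illustrate the effect, here is a minimal sketch (a hand-written reconstruction of the fneg test in llvm/test/CodeGen/X86/avx-basic.ll updated below, not copied from the test file, assuming an AVX1 target such as -mtriple=x86_64-- -mattr=+avx). The splat sign-mask constant is needed by two vxorps instructions, so it cannot be folded into either of them and was previously materialized with a full-width vmovaps constant-pool load:

; Splat -0.0 mask is reused for both 256-bit halves of the <16 x float> negation.
define <16 x float> @fneg(<16 x float> %a) nounwind {
  %neg = fneg <16 x float> %a
  ret <16 x float> %neg
}
; Before: vmovaps      ymm2 = [-0.0E+0 x 8]   ; 32-byte constant pool entry
; After:  vbroadcastss ymm2 = [-0.0E+0 x 8]   ; 4-byte constant pool entry
; followed by the same two vxorps instructions in both cases.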
Added:
Modified:
llvm/lib/Target/X86/X86FixupVectorConstants.cpp
llvm/test/CodeGen/X86/avx-basic.ll
llvm/test/CodeGen/X86/avx-vbroadcast.ll
llvm/test/CodeGen/X86/avx2-conversions.ll
llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll
llvm/test/CodeGen/X86/avx2-vbroadcast.ll
llvm/test/CodeGen/X86/avx512-regcall-Mask.ll
llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
llvm/test/CodeGen/X86/bitreverse.ll
llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll
llvm/test/CodeGen/X86/cast-vsel.ll
llvm/test/CodeGen/X86/combine-and.ll
llvm/test/CodeGen/X86/combine-sdiv.ll
llvm/test/CodeGen/X86/combine-udiv.ll
llvm/test/CodeGen/X86/extractelement-load.ll
llvm/test/CodeGen/X86/fma-fneg-combine-2.ll
llvm/test/CodeGen/X86/fma-intrinsics-fast-isel.ll
llvm/test/CodeGen/X86/fma_patterns.ll
llvm/test/CodeGen/X86/fma_patterns_wide.ll
llvm/test/CodeGen/X86/fminimum-fmaximum.ll
llvm/test/CodeGen/X86/fold-vector-sext-zext.ll
llvm/test/CodeGen/X86/fold-vector-trunc-sitofp.ll
llvm/test/CodeGen/X86/fp-round.ll
llvm/test/CodeGen/X86/insert-into-constant-vector.ll
llvm/test/CodeGen/X86/known-bits-vector.ll
llvm/test/CodeGen/X86/masked_store_trunc.ll
llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
llvm/test/CodeGen/X86/memset-nonzero.ll
llvm/test/CodeGen/X86/merge-store-constants.ll
llvm/test/CodeGen/X86/oddshuffles.ll
llvm/test/CodeGen/X86/paddus.ll
llvm/test/CodeGen/X86/pr30290.ll
llvm/test/CodeGen/X86/pr32368.ll
llvm/test/CodeGen/X86/pr38639.ll
llvm/test/CodeGen/X86/psubus.ll
llvm/test/CodeGen/X86/recip-fastmath.ll
llvm/test/CodeGen/X86/recip-fastmath2.ll
llvm/test/CodeGen/X86/sadd_sat_vec.ll
llvm/test/CodeGen/X86/sat-add.ll
llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll
llvm/test/CodeGen/X86/splat-const.ll
llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll
llvm/test/CodeGen/X86/sqrt-fastmath.ll
llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll
llvm/test/CodeGen/X86/sse2.ll
llvm/test/CodeGen/X86/sshl_sat_vec.ll
llvm/test/CodeGen/X86/ssub_sat_vec.ll
llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll
llvm/test/CodeGen/X86/v8i1-masks.ll
llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll
llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll
llvm/test/CodeGen/X86/vec_anyext.ll
llvm/test/CodeGen/X86/vec_fabs.ll
llvm/test/CodeGen/X86/vec_fp_to_int.ll
llvm/test/CodeGen/X86/vec_int_to_fp.ll
llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
llvm/test/CodeGen/X86/vector-fshl-256.ll
llvm/test/CodeGen/X86/vector-fshr-256.ll
llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll
llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll
llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll
llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll
llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll
llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll
llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll
llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll
llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll
llvm/test/CodeGen/X86/vector-reduce-add-mask.ll
llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
llvm/test/CodeGen/X86/vector-shuffle-avx512.ll
llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
llvm/test/CodeGen/X86/vector-shuffle-combining.ll
llvm/test/CodeGen/X86/vector-trunc-math.ll
llvm/test/CodeGen/X86/vector-trunc-ssat.ll
llvm/test/CodeGen/X86/vector-trunc-usat.ll
llvm/test/CodeGen/X86/vector-trunc.ll
llvm/test/CodeGen/X86/vselect-avx.ll
llvm/test/CodeGen/X86/vselect-zero.ll
llvm/test/CodeGen/X86/win_cst_pool.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86FixupVectorConstants.cpp b/llvm/lib/Target/X86/X86FixupVectorConstants.cpp
index 3e683cb872531..03e474b9e2e18 100644
--- a/llvm/lib/Target/X86/X86FixupVectorConstants.cpp
+++ b/llvm/lib/Target/X86/X86FixupVectorConstants.cpp
@@ -229,7 +229,8 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
MachineBasicBlock &MBB,
MachineInstr &MI) {
unsigned Opc = MI.getOpcode();
- MachineConstantPool *CP = MI.getParent()->getParent()->getConstantPool();
+ MachineConstantPool *CP = MI.getParent()->getParent()->getConstantPool();
+ bool HasDQI = ST->hasDQI();
auto ConvertToBroadcast = [&](unsigned OpBcst256, unsigned OpBcst128,
unsigned OpBcst64, unsigned OpBcst32,
@@ -262,6 +263,50 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
return false;
};
+ // Attempt to convert full width vector loads into broadcast loads.
+ switch (Opc) {
+ /* FP Loads */
+ case X86::MOVAPDrm:
+ case X86::MOVAPSrm:
+ case X86::MOVUPDrm:
+ case X86::MOVUPSrm:
+ // TODO: SSE3 MOVDDUP Handling
+ return false;
+ case X86::VMOVAPDrm:
+ case X86::VMOVAPSrm:
+ case X86::VMOVUPDrm:
+ case X86::VMOVUPSrm:
+ return ConvertToBroadcast(0, 0, X86::VMOVDDUPrm, X86::VBROADCASTSSrm, 0, 0,
+ 1);
+ case X86::VMOVAPDYrm:
+ case X86::VMOVAPSYrm:
+ case X86::VMOVUPDYrm:
+ case X86::VMOVUPSYrm:
+ return ConvertToBroadcast(0, X86::VBROADCASTF128, X86::VBROADCASTSDYrm,
+ X86::VBROADCASTSSYrm, 0, 0, 1);
+ case X86::VMOVAPDZ128rm:
+ case X86::VMOVAPSZ128rm:
+ case X86::VMOVUPDZ128rm:
+ case X86::VMOVUPSZ128rm:
+ return ConvertToBroadcast(0, 0, X86::VMOVDDUPZ128rm,
+ X86::VBROADCASTSSZ128rm, 0, 0, 1);
+ case X86::VMOVAPDZ256rm:
+ case X86::VMOVAPSZ256rm:
+ case X86::VMOVUPDZ256rm:
+ case X86::VMOVUPSZ256rm:
+ return ConvertToBroadcast(
+ 0, HasDQI ? X86::VBROADCASTF64X2Z128rm : X86::VBROADCASTF32X4Z256rm,
+ X86::VBROADCASTSDZ256rm, X86::VBROADCASTSSZ256rm, 0, 0, 1);
+ case X86::VMOVAPDZrm:
+ case X86::VMOVAPSZrm:
+ case X86::VMOVUPDZrm:
+ case X86::VMOVUPSZrm:
+ return ConvertToBroadcast(
+ HasDQI ? X86::VBROADCASTF32X8rm : X86::VBROADCASTF64X4rm,
+ HasDQI ? X86::VBROADCASTF64X2rm : X86::VBROADCASTF32X4rm,
+ X86::VBROADCASTSDZrm, X86::VBROADCASTSSZrm, 0, 0, 1);
+ }
+
// Attempt to find an AVX512 mapping from a full width memory-fold instruction
// to a broadcast-fold instruction variant.
if ((MI.getDesc().TSFlags & X86II::EncodingMask) == X86II::EVEX) {
diff --git a/llvm/test/CodeGen/X86/avx-basic.ll b/llvm/test/CodeGen/X86/avx-basic.ll
index d37d290e55a25..b47f424acc942 100644
--- a/llvm/test/CodeGen/X86/avx-basic.ll
+++ b/llvm/test/CodeGen/X86/avx-basic.ll
@@ -87,7 +87,7 @@ define <8 x i32> @VMOVZQI2PQI(ptr nocapture %aFOO) nounwind {
define <16 x float> @fneg(<16 x float> %a) nounwind {
; CHECK-LABEL: fneg:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; CHECK-NEXT: vbroadcastss {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; CHECK-NEXT: vxorps %ymm2, %ymm0, %ymm0
; CHECK-NEXT: vxorps %ymm2, %ymm1, %ymm1
; CHECK-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/avx-vbroadcast.ll b/llvm/test/CodeGen/X86/avx-vbroadcast.ll
index f17cbc31fe66a..54bce767f1fcc 100644
--- a/llvm/test/CodeGen/X86/avx-vbroadcast.ll
+++ b/llvm/test/CodeGen/X86/avx-vbroadcast.ll
@@ -300,12 +300,12 @@ entry:
define <4 x float> @_e2(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: _e2:
; X86: ## %bb.0: ## %entry
-; X86-NEXT: vmovaps {{.*#+}} xmm0 = [-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3]
+; X86-NEXT: vbroadcastss {{.*#+}} xmm0 = [-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3]
; X86-NEXT: retl
;
; X64-LABEL: _e2:
; X64: ## %bb.0: ## %entry
-; X64-NEXT: vmovaps {{.*#+}} xmm0 = [-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3]
+; X64-NEXT: vbroadcastss {{.*#+}} xmm0 = [-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3]
; X64-NEXT: retq
entry:
%vecinit.i = insertelement <4 x float> undef, float 0xbf80000000000000, i32 0
diff --git a/llvm/test/CodeGen/X86/avx2-conversions.ll b/llvm/test/CodeGen/X86/avx2-conversions.ll
index 0dd83eec50fcf..7b35e602cc0fa 100644
--- a/llvm/test/CodeGen/X86/avx2-conversions.ll
+++ b/llvm/test/CodeGen/X86/avx2-conversions.ll
@@ -16,7 +16,8 @@ define <4 x i32> @trunc4(<4 x i64> %A) nounwind {
;
; X86-FAST-ALL-LABEL: trunc4:
; X86-FAST-ALL: # %bb.0:
-; X86-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
+; X86-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
+; X86-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1]
; X86-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; X86-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; X86-FAST-ALL-NEXT: vzeroupper
@@ -38,7 +39,8 @@ define <4 x i32> @trunc4(<4 x i64> %A) nounwind {
;
; X64-FAST-ALL-LABEL: trunc4:
; X64-FAST-ALL: # %bb.0:
-; X64-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
+; X64-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
+; X64-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1]
; X64-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; X64-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; X64-FAST-ALL-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll b/llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll
index 734e56008e083..15e2c3890354f 100644
--- a/llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll
+++ b/llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll
@@ -72,30 +72,34 @@ declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>) nounwind readn
define <32 x i8> @test_x86_avx2_packsswb_fold() {
; X86-AVX-LABEL: test_x86_avx2_packsswb_fold:
; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0,0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
-; X86-AVX-NEXT: # encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
-; X86-AVX-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
+; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0,0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
+; X86-AVX-NEXT: # encoding: [0xc4,0xe2,0x7d,0x1a,0x05,A,A,A,A]
+; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
+; X86-AVX-NEXT: # ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT: retl # encoding: [0xc3]
;
; X86-AVX512VL-LABEL: test_x86_avx2_packsswb_fold:
; X86-AVX512VL: # %bb.0:
-; X86-AVX512VL-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0 # EVEX TO VEX Compression ymm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0,0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
-; X86-AVX512VL-NEXT: # encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
-; X86-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
+; X86-AVX512VL-NEXT: vbroadcastf128 {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0 # EVEX TO VEX Compression ymm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0,0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
+; X86-AVX512VL-NEXT: # encoding: [0xc4,0xe2,0x7d,0x1a,0x05,A,A,A,A]
+; X86-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
+; X86-AVX512VL-NEXT: # ymm0 = mem[0,1,0,1]
; X86-AVX512VL-NEXT: retl # encoding: [0xc3]
;
; X64-AVX-LABEL: test_x86_avx2_packsswb_fold:
; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0,0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
-; X64-AVX-NEXT: # encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
-; X64-AVX-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
+; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0,0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
+; X64-AVX-NEXT: # encoding: [0xc4,0xe2,0x7d,0x1a,0x05,A,A,A,A]
+; X64-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
+; X64-AVX-NEXT: # ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT: retq # encoding: [0xc3]
;
; X64-AVX512VL-LABEL: test_x86_avx2_packsswb_fold:
; X64-AVX512VL: # %bb.0:
-; X64-AVX512VL-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 # EVEX TO VEX Compression ymm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0,0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
-; X64-AVX512VL-NEXT: # encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
-; X64-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
+; X64-AVX512VL-NEXT: vbroadcastf128 {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 # EVEX TO VEX Compression ymm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0,0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
+; X64-AVX512VL-NEXT: # encoding: [0xc4,0xe2,0x7d,0x1a,0x05,A,A,A,A]
+; X64-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
+; X64-AVX512VL-NEXT: # ymm0 = mem[0,1,0,1]
; X64-AVX512VL-NEXT: retq # encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> <i16 0, i16 255, i16 256, i16 65535, i16 -1, i16 -255, i16 -256, i16 -32678, i16 0, i16 255, i16 256, i16 65535, i16 -1, i16 -255, i16 -256, i16 -32678>, <16 x i16> zeroinitializer)
ret <32 x i8> %res
@@ -121,30 +125,34 @@ declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) nounwind readn
define <32 x i8> @test_x86_avx2_packuswb_fold() {
; X86-AVX-LABEL: test_x86_avx2_packuswb_fold:
; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; X86-AVX-NEXT: # encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
-; X86-AVX-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
+; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; X86-AVX-NEXT: # encoding: [0xc4,0xe2,0x7d,0x1a,0x05,A,A,A,A]
+; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
+; X86-AVX-NEXT: # ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT: retl # encoding: [0xc3]
;
; X86-AVX512VL-LABEL: test_x86_avx2_packuswb_fold:
; X86-AVX512VL: # %bb.0:
-; X86-AVX512VL-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0 # EVEX TO VEX Compression ymm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; X86-AVX512VL-NEXT: # encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
-; X86-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
+; X86-AVX512VL-NEXT: vbroadcastf128 {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0 # EVEX TO VEX Compression ymm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; X86-AVX512VL-NEXT: # encoding: [0xc4,0xe2,0x7d,0x1a,0x05,A,A,A,A]
+; X86-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
+; X86-AVX512VL-NEXT: # ymm0 = mem[0,1,0,1]
; X86-AVX512VL-NEXT: retl # encoding: [0xc3]
;
; X64-AVX-LABEL: test_x86_avx2_packuswb_fold:
; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; X64-AVX-NEXT: # encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
-; X64-AVX-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
+; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; X64-AVX-NEXT: # encoding: [0xc4,0xe2,0x7d,0x1a,0x05,A,A,A,A]
+; X64-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
+; X64-AVX-NEXT: # ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT: retq # encoding: [0xc3]
;
; X64-AVX512VL-LABEL: test_x86_avx2_packuswb_fold:
; X64-AVX512VL: # %bb.0:
-; X64-AVX512VL-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 # EVEX TO VEX Compression ymm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; X64-AVX512VL-NEXT: # encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
-; X64-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
+; X64-AVX512VL-NEXT: vbroadcastf128 {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 # EVEX TO VEX Compression ymm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; X64-AVX512VL-NEXT: # encoding: [0xc4,0xe2,0x7d,0x1a,0x05,A,A,A,A]
+; X64-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
+; X64-AVX512VL-NEXT: # ymm0 = mem[0,1,0,1]
; X64-AVX512VL-NEXT: retq # encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> <i16 0, i16 255, i16 256, i16 65535, i16 -1, i16 -255, i16 -256, i16 -32678, i16 0, i16 255, i16 256, i16 65535, i16 -1, i16 -255, i16 -256, i16 -32678>, <16 x i16> zeroinitializer)
ret <32 x i8> %res
diff --git a/llvm/test/CodeGen/X86/avx2-vbroadcast.ll b/llvm/test/CodeGen/X86/avx2-vbroadcast.ll
index 70d99e79e1e57..b7516d30df5f6 100644
--- a/llvm/test/CodeGen/X86/avx2-vbroadcast.ll
+++ b/llvm/test/CodeGen/X86/avx2-vbroadcast.ll
@@ -657,12 +657,12 @@ define <4 x float> @_e2(ptr %ptr) nounwind uwtable readnone ssp {
define <8 x i8> @_e4(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: _e4:
; X86: ## %bb.0:
-; X86-NEXT: vmovaps {{.*#+}} xmm0 = <52,52,52,52,52,52,52,52,u,u,u,u,u,u,u,u>
+; X86-NEXT: vbroadcastss {{.*#+}} xmm0 = [52,52,52,52,52,52,52,52,52,52,52,52,52,52,52,52]
; X86-NEXT: retl
;
; X64-LABEL: _e4:
; X64: ## %bb.0:
-; X64-NEXT: vmovaps {{.*#+}} xmm0 = <52,52,52,52,52,52,52,52,u,u,u,u,u,u,u,u>
+; X64-NEXT: vbroadcastss {{.*#+}} xmm0 = [52,52,52,52,52,52,52,52,52,52,52,52,52,52,52,52]
; X64-NEXT: retq
%vecinit0.i = insertelement <8 x i8> undef, i8 52, i32 0
%vecinit1.i = insertelement <8 x i8> %vecinit0.i, i8 52, i32 1
diff --git a/llvm/test/CodeGen/X86/avx512-regcall-Mask.ll b/llvm/test/CodeGen/X86/avx512-regcall-Mask.ll
index 474f6a9e1948e..34a205a7baa86 100644
--- a/llvm/test/CodeGen/X86/avx512-regcall-Mask.ll
+++ b/llvm/test/CodeGen/X86/avx512-regcall-Mask.ll
@@ -98,9 +98,10 @@ define dso_local i64 @caller_argv64i1() #0 {
; X32: # %bb.0: # %entry
; X32-NEXT: pushl %edi
; X32-NEXT: subl $88, %esp
-; X32-NEXT: vmovaps {{.*#+}} xmm0 = [2,1,2,1]
+; X32-NEXT: vmovddup {{.*#+}} xmm0 = [2,1,2,1]
+; X32-NEXT: # xmm0 = mem[0,0]
; X32-NEXT: vmovups %xmm0, {{[0-9]+}}(%esp)
-; X32-NEXT: vmovaps {{.*#+}} zmm0 = [2,1,2,1,2,1,2,1,2,1,2,1,2,1,2,1]
+; X32-NEXT: vbroadcastsd {{.*#+}} zmm0 = [2,1,2,1,2,1,2,1,2,1,2,1,2,1,2,1]
; X32-NEXT: vmovups %zmm0, (%esp)
; X32-NEXT: movl $1, {{[0-9]+}}(%esp)
; X32-NEXT: movl $2, {{[0-9]+}}(%esp)
diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
index 0086d05d1ef9c..cc0da34453eb5 100644
--- a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
+++ b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
@@ -3630,7 +3630,8 @@ define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask1(ptr %vp, <4 x
; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps 32(%rdi), %ymm2
-; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = <0,10,6,15,u,u,u,u>
+; CHECK-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [0,10,6,15,0,10,6,15]
+; CHECK-NEXT: # ymm3 = mem[0,1,0,1]
; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm3
; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
@@ -3648,7 +3649,8 @@ define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask1(ptr %vp, <4
; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps 32(%rdi), %ymm2
-; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = <0,10,6,15,u,u,u,u>
+; CHECK-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,10,6,15,0,10,6,15]
+; CHECK-NEXT: # ymm1 = mem[0,1,0,1]
; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1
; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm1 {%k1} {z}
@@ -3892,7 +3894,8 @@ define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mem_mask1(ptr %vp,
define <4 x double> @test_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec) {
; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [7,3,7,3]
+; CHECK-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [7,3,7,3]
+; CHECK-NEXT: # ymm1 = mem[0,1,0,1]
; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; CHECK-NEXT: retq
@@ -3902,7 +3905,8 @@ define <4 x double> @test_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec) {
define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [7,3,7,3]
+; CHECK-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [7,3,7,3]
+; CHECK-NEXT: # ymm3 = mem[0,1,0,1]
; CHECK-NEXT: vpermpd %zmm0, %zmm3, %zmm0
; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
@@ -3917,7 +3921,8 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask0(<8 x double> %v
define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec, <4 x double> %mask) {
; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [7,3,7,3]
+; CHECK-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [7,3,7,3]
+; CHECK-NEXT: # ymm2 = mem[0,1,0,1]
; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
diff --git a/llvm/test/CodeGen/X86/bitreverse.ll b/llvm/test/CodeGen/X86/bitreverse.ll
index 2203d82907930..bcae88259a92e 100644
--- a/llvm/test/CodeGen/X86/bitreverse.ll
+++ b/llvm/test/CodeGen/X86/bitreverse.ll
@@ -592,12 +592,12 @@ define <2 x i16> @fold_v2i16() {
;
; X86XOP-LABEL: fold_v2i16:
; X86XOP: # %bb.0:
-; X86XOP-NEXT: vmovaps {{.*#+}} xmm0 = <61440,240,u,u,u,u,u,u>
+; X86XOP-NEXT: vbroadcastss {{.*#+}} xmm0 = [61440,240,61440,240,61440,240,61440,240]
; X86XOP-NEXT: retl
;
; GFNI-LABEL: fold_v2i16:
; GFNI: # %bb.0:
-; GFNI-NEXT: vmovaps {{.*#+}} xmm0 = <61440,240,u,u,u,u,u,u>
+; GFNI-NEXT: vbroadcastss {{.*#+}} xmm0 = [61440,240,61440,240,61440,240,61440,240]
; GFNI-NEXT: retq
%b = call <2 x i16> @llvm.bitreverse.v2i16(<2 x i16> <i16 15, i16 3840>)
ret <2 x i16> %b
diff --git a/llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll b/llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll
index c0dc8033710ed..94500997987c9 100644
--- a/llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll
+++ b/llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll
@@ -296,7 +296,7 @@ define <64 x i8> @f64xi8_i16(<64 x i8> %a) {
; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2
; AVX-NEXT: vpaddb %xmm3, %xmm0, %xmm0
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX-NEXT: vbroadcastss {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX-NEXT: retl
@@ -328,7 +328,7 @@ define <64 x i8> @f64xi8_i16(<64 x i8> %a) {
; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2
; AVX-64-NEXT: vpaddb %xmm3, %xmm0, %xmm0
; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX-64-NEXT: vbroadcastss {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX-64-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/cast-vsel.ll b/llvm/test/CodeGen/X86/cast-vsel.ll
index 6b86b7f912ca1..2fd7b34eceec9 100644
--- a/llvm/test/CodeGen/X86/cast-vsel.ll
+++ b/llvm/test/CodeGen/X86/cast-vsel.ll
@@ -194,7 +194,7 @@ define <8 x i16> @trunc(<8 x i16> %a, <8 x i16> %b, <8 x i32> %c, <8 x i32> %d)
; AVX1-LABEL: trunc:
; AVX1: # %bb.0:
; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,65535,65535]
; AVX1-NEXT: vandps %ymm1, %ymm2, %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2
@@ -337,7 +337,7 @@ define dso_local void @example25() nounwind {
; AVX1-LABEL: example25:
; AVX1: # %bb.0: # %vector.ph
; AVX1-NEXT: movq $-4096, %rax # imm = 0xF000
-; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [1,1,1,1,1,1,1,1]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm0 = [1,1,1,1,1,1,1,1]
; AVX1-NEXT: .p2align 4, 0x90
; AVX1-NEXT: .LBB5_1: # %vector.body
; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
diff --git a/llvm/test/CodeGen/X86/combine-and.ll b/llvm/test/CodeGen/X86/combine-and.ll
index 43c85fdc703bf..d223b75419ac4 100644
--- a/llvm/test/CodeGen/X86/combine-and.ll
+++ b/llvm/test/CodeGen/X86/combine-and.ll
@@ -325,21 +325,11 @@ define <2 x i64> @and_or_v2i64(<2 x i64> %a0) {
; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,8]
; SSE-NEXT: retq
;
-; AVX1-LABEL: and_or_v2i64:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [8,8]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: and_or_v2i64:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [8,8]
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: and_or_v2i64:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vmovddup {{.*#+}} xmm0 = [8,8]
-; AVX512-NEXT: # xmm0 = mem[0,0]
-; AVX512-NEXT: retq
+; AVX-LABEL: and_or_v2i64:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovddup {{.*#+}} xmm0 = [8,8]
+; AVX-NEXT: # xmm0 = mem[0,0]
+; AVX-NEXT: retq
%1 = or <2 x i64> %a0, <i64 255, i64 255>
%2 = and <2 x i64> %1, <i64 8, i64 8>
ret <2 x i64> %2
@@ -351,20 +341,10 @@ define <4 x i32> @and_or_v4i32(<4 x i32> %a0) {
; SSE-NEXT: movaps {{.*#+}} xmm0 = [3,3,3,3]
; SSE-NEXT: retq
;
-; AVX1-LABEL: and_or_v4i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [3,3,3,3]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: and_or_v4i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vbroadcastss {{.*#+}} xmm0 = [3,3,3,3]
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: and_or_v4i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vbroadcastss {{.*#+}} xmm0 = [3,3,3,3]
-; AVX512-NEXT: retq
+; AVX-LABEL: and_or_v4i32:
+; AVX: # %bb.0:
+; AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [3,3,3,3]
+; AVX-NEXT: retq
%1 = or <4 x i32> %a0, <i32 15, i32 15, i32 15, i32 15>
%2 = and <4 x i32> %1, <i32 3, i32 3, i32 3, i32 3>
ret <4 x i32> %2
diff --git a/llvm/test/CodeGen/X86/combine-sdiv.ll b/llvm/test/CodeGen/X86/combine-sdiv.ll
index 0f5f28a857940..bcdcfdd714784 100644
--- a/llvm/test/CodeGen/X86/combine-sdiv.ll
+++ b/llvm/test/CodeGen/X86/combine-sdiv.ll
@@ -147,20 +147,10 @@ define <4 x i32> @combine_vec_sdiv_dupe(<4 x i32> %x) {
; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,1,1,1]
; SSE-NEXT: retq
;
-; AVX1-LABEL: combine_vec_sdiv_dupe:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,1,1]
-; AVX1-NEXT: retq
-;
-; AVX2ORLATER-LABEL: combine_vec_sdiv_dupe:
-; AVX2ORLATER: # %bb.0:
-; AVX2ORLATER-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1]
-; AVX2ORLATER-NEXT: retq
-;
-; XOP-LABEL: combine_vec_sdiv_dupe:
-; XOP: # %bb.0:
-; XOP-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,1,1]
-; XOP-NEXT: retq
+; AVX-LABEL: combine_vec_sdiv_dupe:
+; AVX: # %bb.0:
+; AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1]
+; AVX-NEXT: retq
%1 = sdiv <4 x i32> %x, %x
ret <4 x i32> %1
}
diff --git a/llvm/test/CodeGen/X86/combine-udiv.ll b/llvm/test/CodeGen/X86/combine-udiv.ll
index f4b13ee495ec0..e013d8cd33598 100644
--- a/llvm/test/CodeGen/X86/combine-udiv.ll
+++ b/llvm/test/CodeGen/X86/combine-udiv.ll
@@ -135,19 +135,14 @@ define <4 x i32> @combine_vec_udiv_dupe(<4 x i32> %x) {
; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,1,1,1]
; SSE-NEXT: retq
;
-; AVX1-LABEL: combine_vec_udiv_dupe:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,1,1]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: combine_vec_udiv_dupe:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1]
-; AVX2-NEXT: retq
+; AVX-LABEL: combine_vec_udiv_dupe:
+; AVX: # %bb.0:
+; AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1]
+; AVX-NEXT: retq
;
; XOP-LABEL: combine_vec_udiv_dupe:
; XOP: # %bb.0:
-; XOP-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,1,1]
+; XOP-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1]
; XOP-NEXT: retq
%1 = udiv <4 x i32> %x, %x
ret <4 x i32> %1
diff --git a/llvm/test/CodeGen/X86/extractelement-load.ll b/llvm/test/CodeGen/X86/extractelement-load.ll
index 1e891b5330a3c..538b8ed10f25b 100644
--- a/llvm/test/CodeGen/X86/extractelement-load.ll
+++ b/llvm/test/CodeGen/X86/extractelement-load.ll
@@ -469,61 +469,33 @@ define i32 @main() nounwind {
; X64-SSSE3-NEXT: popq %rbp
; X64-SSSE3-NEXT: retq
;
-; X64-AVX1-LABEL: main:
-; X64-AVX1: # %bb.0:
-; X64-AVX1-NEXT: pushq %rbp
-; X64-AVX1-NEXT: movq %rsp, %rbp
-; X64-AVX1-NEXT: andq $-32, %rsp
-; X64-AVX1-NEXT: subq $64, %rsp
-; X64-AVX1-NEXT: movq n1@GOTPCREL(%rip), %rax
-; X64-AVX1-NEXT: vmovaps (%rax), %ymm0
-; X64-AVX1-NEXT: movl zero+4(%rip), %ecx
-; X64-AVX1-NEXT: movl zero+8(%rip), %eax
-; X64-AVX1-NEXT: vmovaps %ymm0, zero(%rip)
-; X64-AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2]
-; X64-AVX1-NEXT: vmovaps %ymm0, (%rsp)
-; X64-AVX1-NEXT: vmovaps (%rsp), %ymm0
-; X64-AVX1-NEXT: vextractps $2, %xmm0, %esi
-; X64-AVX1-NEXT: xorl %edx, %edx
-; X64-AVX1-NEXT: divl %esi
-; X64-AVX1-NEXT: movl %eax, %esi
-; X64-AVX1-NEXT: vextractps $1, %xmm0, %edi
-; X64-AVX1-NEXT: movl %ecx, %eax
-; X64-AVX1-NEXT: xorl %edx, %edx
-; X64-AVX1-NEXT: divl %edi
-; X64-AVX1-NEXT: addl %esi, %eax
-; X64-AVX1-NEXT: movq %rbp, %rsp
-; X64-AVX1-NEXT: popq %rbp
-; X64-AVX1-NEXT: vzeroupper
-; X64-AVX1-NEXT: retq
-;
-; X64-AVX2-LABEL: main:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: pushq %rbp
-; X64-AVX2-NEXT: movq %rsp, %rbp
-; X64-AVX2-NEXT: andq $-32, %rsp
-; X64-AVX2-NEXT: subq $64, %rsp
-; X64-AVX2-NEXT: movq n1@GOTPCREL(%rip), %rax
-; X64-AVX2-NEXT: vmovaps (%rax), %ymm0
-; X64-AVX2-NEXT: movl zero+4(%rip), %ecx
-; X64-AVX2-NEXT: movl zero+8(%rip), %eax
-; X64-AVX2-NEXT: vmovaps %ymm0, zero(%rip)
-; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2]
-; X64-AVX2-NEXT: vmovaps %ymm0, (%rsp)
-; X64-AVX2-NEXT: vmovaps (%rsp), %ymm0
-; X64-AVX2-NEXT: vextractps $2, %xmm0, %esi
-; X64-AVX2-NEXT: xorl %edx, %edx
-; X64-AVX2-NEXT: divl %esi
-; X64-AVX2-NEXT: movl %eax, %esi
-; X64-AVX2-NEXT: vextractps $1, %xmm0, %edi
-; X64-AVX2-NEXT: movl %ecx, %eax
-; X64-AVX2-NEXT: xorl %edx, %edx
-; X64-AVX2-NEXT: divl %edi
-; X64-AVX2-NEXT: addl %esi, %eax
-; X64-AVX2-NEXT: movq %rbp, %rsp
-; X64-AVX2-NEXT: popq %rbp
-; X64-AVX2-NEXT: vzeroupper
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: main:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: pushq %rbp
+; X64-AVX-NEXT: movq %rsp, %rbp
+; X64-AVX-NEXT: andq $-32, %rsp
+; X64-AVX-NEXT: subq $64, %rsp
+; X64-AVX-NEXT: movq n1@GOTPCREL(%rip), %rax
+; X64-AVX-NEXT: vmovaps (%rax), %ymm0
+; X64-AVX-NEXT: movl zero+4(%rip), %ecx
+; X64-AVX-NEXT: movl zero+8(%rip), %eax
+; X64-AVX-NEXT: vmovaps %ymm0, zero(%rip)
+; X64-AVX-NEXT: vbroadcastss {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2]
+; X64-AVX-NEXT: vmovaps %ymm0, (%rsp)
+; X64-AVX-NEXT: vmovaps (%rsp), %ymm0
+; X64-AVX-NEXT: vextractps $2, %xmm0, %esi
+; X64-AVX-NEXT: xorl %edx, %edx
+; X64-AVX-NEXT: divl %esi
+; X64-AVX-NEXT: movl %eax, %esi
+; X64-AVX-NEXT: vextractps $1, %xmm0, %edi
+; X64-AVX-NEXT: movl %ecx, %eax
+; X64-AVX-NEXT: xorl %edx, %edx
+; X64-AVX-NEXT: divl %edi
+; X64-AVX-NEXT: addl %esi, %eax
+; X64-AVX-NEXT: movq %rbp, %rsp
+; X64-AVX-NEXT: popq %rbp
+; X64-AVX-NEXT: vzeroupper
+; X64-AVX-NEXT: retq
%stackptr = alloca <8 x i32>, align 32
%z = load <8 x i32>, ptr @zero, align 32
%t1 = load <8 x i32>, ptr @n1, align 32
diff --git a/llvm/test/CodeGen/X86/fma-fneg-combine-2.ll b/llvm/test/CodeGen/X86/fma-fneg-combine-2.ll
index 2a3c3e3c7f4f7..bb8ee2238a004 100644
--- a/llvm/test/CodeGen/X86/fma-fneg-combine-2.ll
+++ b/llvm/test/CodeGen/X86/fma-fneg-combine-2.ll
@@ -189,7 +189,7 @@ define <4 x double> @negated_constant_v4f64_fadd(<4 x double> %a) {
define <4 x double> @negated_constant_v4f64_2fma_undefs(<4 x double> %a, <4 x double> %b) {
; FMA3-LABEL: negated_constant_v4f64_2fma_undefs:
; FMA3: # %bb.0:
-; FMA3-NEXT: vmovapd {{.*#+}} ymm2 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; FMA3-NEXT: vbroadcastsd {{.*#+}} ymm2 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; FMA3-NEXT: vfnmadd213pd {{.*#+}} ymm0 = -(ymm2 * ymm0) + mem
; FMA3-NEXT: vfmadd132pd {{.*#+}} ymm1 = (ymm1 * mem) + ymm2
; FMA3-NEXT: vaddpd %ymm1, %ymm0, %ymm0
@@ -197,7 +197,7 @@ define <4 x double> @negated_constant_v4f64_2fma_undefs(<4 x double> %a, <4 x do
;
; FMA4-LABEL: negated_constant_v4f64_2fma_undefs:
; FMA4: # %bb.0:
-; FMA4-NEXT: vmovapd {{.*#+}} ymm2 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; FMA4-NEXT: vbroadcastsd {{.*#+}} ymm2 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; FMA4-NEXT: vfnmaddpd {{.*#+}} ymm0 = -(ymm0 * ymm2) + mem
; FMA4-NEXT: vfmaddpd {{.*#+}} ymm1 = (ymm1 * mem) + ymm2
; FMA4-NEXT: vaddpd %ymm1, %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/fma-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/fma-intrinsics-fast-isel.ll
index e4cc8f23fd38e..a886a3c830340 100644
--- a/llvm/test/CodeGen/X86/fma-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/fma-intrinsics-fast-isel.ll
@@ -160,7 +160,7 @@ entry:
define <4 x float> @test_mm_fnmsub_ps(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
; CHECK-LABEL: test_mm_fnmsub_ps:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; CHECK-NEXT: vbroadcastss {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; CHECK-NEXT: vxorps %xmm3, %xmm0, %xmm4
; CHECK-NEXT: vxorps %xmm3, %xmm2, %xmm0
; CHECK-NEXT: vfmadd231ps {{.*#+}} xmm0 = (xmm1 * xmm4) + xmm0
@@ -175,7 +175,8 @@ entry:
define <2 x double> @test_mm_fnmsub_pd(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
; CHECK-LABEL: test_mm_fnmsub_pd:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vmovapd {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0]
+; CHECK-NEXT: vmovddup {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0]
+; CHECK-NEXT: # xmm3 = mem[0,0]
; CHECK-NEXT: vxorpd %xmm3, %xmm0, %xmm4
; CHECK-NEXT: vxorpd %xmm3, %xmm2, %xmm0
; CHECK-NEXT: vfmadd231pd {{.*#+}} xmm0 = (xmm1 * xmm4) + xmm0
@@ -342,7 +343,7 @@ entry:
define <8 x float> @test_mm256_fnmsub_ps(<8 x float> %a, <8 x float> %b, <8 x float> %c) {
; CHECK-LABEL: test_mm256_fnmsub_ps:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; CHECK-NEXT: vbroadcastss {{.*#+}} ymm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; CHECK-NEXT: vxorps %ymm3, %ymm0, %ymm4
; CHECK-NEXT: vxorps %ymm3, %ymm2, %ymm0
; CHECK-NEXT: vfmadd231ps {{.*#+}} ymm0 = (ymm1 * ymm4) + ymm0
@@ -357,7 +358,7 @@ entry:
define <4 x double> @test_mm256_fnmsub_pd(<4 x double> %a, <4 x double> %b, <4 x double> %c) {
; CHECK-LABEL: test_mm256_fnmsub_pd:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; CHECK-NEXT: vxorpd %ymm3, %ymm0, %ymm4
; CHECK-NEXT: vxorpd %ymm3, %ymm2, %ymm0
; CHECK-NEXT: vfmadd231pd {{.*#+}} ymm0 = (ymm1 * ymm4) + ymm0
diff --git a/llvm/test/CodeGen/X86/fma_patterns.ll b/llvm/test/CodeGen/X86/fma_patterns.ll
index bac41849a4108..aa99672b8fc6a 100644
--- a/llvm/test/CodeGen/X86/fma_patterns.ll
+++ b/llvm/test/CodeGen/X86/fma_patterns.ll
@@ -791,14 +791,14 @@ define <4 x float> @test_v4f32_mul_y_add_x_negone_undefs(<4 x float> %x, <4 x fl
define <4 x float> @test_v4f32_mul_sub_one_x_y(<4 x float> %x, <4 x float> %y) {
; FMA-INFS-LABEL: test_v4f32_mul_sub_one_x_y:
; FMA-INFS: # %bb.0:
-; FMA-INFS-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; FMA-INFS-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0
; FMA-INFS-NEXT: vmulps %xmm1, %xmm0, %xmm0
; FMA-INFS-NEXT: retq
;
; FMA4-INFS-LABEL: test_v4f32_mul_sub_one_x_y:
; FMA4-INFS: # %bb.0:
-; FMA4-INFS-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA4-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0
; FMA4-INFS-NEXT: vmulps %xmm1, %xmm0, %xmm0
; FMA4-INFS-NEXT: retq
@@ -832,14 +832,14 @@ define <4 x float> @test_v4f32_mul_sub_one_x_y(<4 x float> %x, <4 x float> %y) {
define <4 x float> @test_v4f32_mul_y_sub_one_x(<4 x float> %x, <4 x float> %y) {
; FMA-INFS-LABEL: test_v4f32_mul_y_sub_one_x:
; FMA-INFS: # %bb.0:
-; FMA-INFS-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; FMA-INFS-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0
; FMA-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0
; FMA-INFS-NEXT: retq
;
; FMA4-INFS-LABEL: test_v4f32_mul_y_sub_one_x:
; FMA4-INFS: # %bb.0:
-; FMA4-INFS-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA4-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0
; FMA4-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0
; FMA4-INFS-NEXT: retq
@@ -873,14 +873,14 @@ define <4 x float> @test_v4f32_mul_y_sub_one_x(<4 x float> %x, <4 x float> %y) {
define <4 x float> @test_v4f32_mul_y_sub_one_x_undefs(<4 x float> %x, <4 x float> %y) {
; FMA-INFS-LABEL: test_v4f32_mul_y_sub_one_x_undefs:
; FMA-INFS: # %bb.0:
-; FMA-INFS-NEXT: vmovaps {{.*#+}} xmm2 = <1.0E+0,u,1.0E+0,1.0E+0>
+; FMA-INFS-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0
; FMA-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0
; FMA-INFS-NEXT: retq
;
; FMA4-INFS-LABEL: test_v4f32_mul_y_sub_one_x_undefs:
; FMA4-INFS: # %bb.0:
-; FMA4-INFS-NEXT: vmovaps {{.*#+}} xmm2 = <1.0E+0,u,1.0E+0,1.0E+0>
+; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA4-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0
; FMA4-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0
; FMA4-INFS-NEXT: retq
@@ -914,14 +914,14 @@ define <4 x float> @test_v4f32_mul_y_sub_one_x_undefs(<4 x float> %x, <4 x float
define <4 x float> @test_v4f32_mul_sub_negone_x_y(<4 x float> %x, <4 x float> %y) {
; FMA-INFS-LABEL: test_v4f32_mul_sub_negone_x_y:
; FMA-INFS: # %bb.0:
-; FMA-INFS-NEXT: vmovaps {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
+; FMA-INFS-NEXT: vbroadcastss {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
; FMA-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0
; FMA-INFS-NEXT: vmulps %xmm1, %xmm0, %xmm0
; FMA-INFS-NEXT: retq
;
; FMA4-INFS-LABEL: test_v4f32_mul_sub_negone_x_y:
; FMA4-INFS: # %bb.0:
-; FMA4-INFS-NEXT: vmovaps {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
+; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
; FMA4-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0
; FMA4-INFS-NEXT: vmulps %xmm1, %xmm0, %xmm0
; FMA4-INFS-NEXT: retq
@@ -955,14 +955,14 @@ define <4 x float> @test_v4f32_mul_sub_negone_x_y(<4 x float> %x, <4 x float> %y
define <4 x float> @test_v4f32_mul_y_sub_negone_x(<4 x float> %x, <4 x float> %y) {
; FMA-INFS-LABEL: test_v4f32_mul_y_sub_negone_x:
; FMA-INFS: # %bb.0:
-; FMA-INFS-NEXT: vmovaps {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
+; FMA-INFS-NEXT: vbroadcastss {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
; FMA-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0
; FMA-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0
; FMA-INFS-NEXT: retq
;
; FMA4-INFS-LABEL: test_v4f32_mul_y_sub_negone_x:
; FMA4-INFS: # %bb.0:
-; FMA4-INFS-NEXT: vmovaps {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
+; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
; FMA4-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0
; FMA4-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0
; FMA4-INFS-NEXT: retq
@@ -996,14 +996,14 @@ define <4 x float> @test_v4f32_mul_y_sub_negone_x(<4 x float> %x, <4 x float> %y
define <4 x float> @test_v4f32_mul_y_sub_negone_x_undefs(<4 x float> %x, <4 x float> %y) {
; FMA-INFS-LABEL: test_v4f32_mul_y_sub_negone_x_undefs:
; FMA-INFS: # %bb.0:
-; FMA-INFS-NEXT: vmovaps {{.*#+}} xmm2 = <-1.0E+0,-1.0E+0,u,-1.0E+0>
+; FMA-INFS-NEXT: vbroadcastss {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
; FMA-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0
; FMA-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0
; FMA-INFS-NEXT: retq
;
; FMA4-INFS-LABEL: test_v4f32_mul_y_sub_negone_x_undefs:
; FMA4-INFS: # %bb.0:
-; FMA4-INFS-NEXT: vmovaps {{.*#+}} xmm2 = <-1.0E+0,-1.0E+0,u,-1.0E+0>
+; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
; FMA4-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0
; FMA4-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0
; FMA4-INFS-NEXT: retq
@@ -1318,7 +1318,7 @@ define float @test_f32_interp(float %x, float %y, float %t) {
define <4 x float> @test_v4f32_interp(<4 x float> %x, <4 x float> %y, <4 x float> %t) {
; FMA-INFS-LABEL: test_v4f32_interp:
; FMA-INFS: # %bb.0:
-; FMA-INFS-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; FMA-INFS-NEXT: vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA-INFS-NEXT: vsubps %xmm2, %xmm3, %xmm3
; FMA-INFS-NEXT: vmulps %xmm3, %xmm1, %xmm1
; FMA-INFS-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm1
@@ -1326,7 +1326,7 @@ define <4 x float> @test_v4f32_interp(<4 x float> %x, <4 x float> %y, <4 x float
;
; FMA4-INFS-LABEL: test_v4f32_interp:
; FMA4-INFS: # %bb.0:
-; FMA4-INFS-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA4-INFS-NEXT: vsubps %xmm2, %xmm3, %xmm3
; FMA4-INFS-NEXT: vmulps %xmm3, %xmm1, %xmm1
; FMA4-INFS-NEXT: vfmaddps {{.*#+}} xmm0 = (xmm0 * xmm2) + xmm1
@@ -1367,7 +1367,7 @@ define <4 x float> @test_v4f32_interp(<4 x float> %x, <4 x float> %y, <4 x float
define <8 x float> @test_v8f32_interp(<8 x float> %x, <8 x float> %y, <8 x float> %t) {
; FMA-INFS-LABEL: test_v8f32_interp:
; FMA-INFS: # %bb.0:
-; FMA-INFS-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; FMA-INFS-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA-INFS-NEXT: vsubps %ymm2, %ymm3, %ymm3
; FMA-INFS-NEXT: vmulps %ymm3, %ymm1, %ymm1
; FMA-INFS-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm1
@@ -1375,7 +1375,7 @@ define <8 x float> @test_v8f32_interp(<8 x float> %x, <8 x float> %y, <8 x float
;
; FMA4-INFS-LABEL: test_v8f32_interp:
; FMA4-INFS: # %bb.0:
-; FMA4-INFS-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA4-INFS-NEXT: vsubps %ymm2, %ymm3, %ymm3
; FMA4-INFS-NEXT: vmulps %ymm3, %ymm1, %ymm1
; FMA4-INFS-NEXT: vfmaddps {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm1
@@ -1465,7 +1465,8 @@ define double @test_f64_interp(double %x, double %y, double %t) {
define <2 x double> @test_v2f64_interp(<2 x double> %x, <2 x double> %y, <2 x double> %t) {
; FMA-INFS-LABEL: test_v2f64_interp:
; FMA-INFS: # %bb.0:
-; FMA-INFS-NEXT: vmovapd {{.*#+}} xmm3 = [1.0E+0,1.0E+0]
+; FMA-INFS-NEXT: vmovddup {{.*#+}} xmm3 = [1.0E+0,1.0E+0]
+; FMA-INFS-NEXT: # xmm3 = mem[0,0]
; FMA-INFS-NEXT: vsubpd %xmm2, %xmm3, %xmm3
; FMA-INFS-NEXT: vmulpd %xmm3, %xmm1, %xmm1
; FMA-INFS-NEXT: vfmadd213pd {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm1
@@ -1473,7 +1474,8 @@ define <2 x double> @test_v2f64_interp(<2 x double> %x, <2 x double> %y, <2 x do
;
; FMA4-INFS-LABEL: test_v2f64_interp:
; FMA4-INFS: # %bb.0:
-; FMA4-INFS-NEXT: vmovapd {{.*#+}} xmm3 = [1.0E+0,1.0E+0]
+; FMA4-INFS-NEXT: vmovddup {{.*#+}} xmm3 = [1.0E+0,1.0E+0]
+; FMA4-INFS-NEXT: # xmm3 = mem[0,0]
; FMA4-INFS-NEXT: vsubpd %xmm2, %xmm3, %xmm3
; FMA4-INFS-NEXT: vmulpd %xmm3, %xmm1, %xmm1
; FMA4-INFS-NEXT: vfmaddpd {{.*#+}} xmm0 = (xmm0 * xmm2) + xmm1
@@ -1515,7 +1517,7 @@ define <2 x double> @test_v2f64_interp(<2 x double> %x, <2 x double> %y, <2 x do
define <4 x double> @test_v4f64_interp(<4 x double> %x, <4 x double> %y, <4 x double> %t) {
; FMA-INFS-LABEL: test_v4f64_interp:
; FMA-INFS: # %bb.0:
-; FMA-INFS-NEXT: vmovapd {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; FMA-INFS-NEXT: vbroadcastsd {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA-INFS-NEXT: vsubpd %ymm2, %ymm3, %ymm3
; FMA-INFS-NEXT: vmulpd %ymm3, %ymm1, %ymm1
; FMA-INFS-NEXT: vfmadd213pd {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm1
@@ -1523,7 +1525,7 @@ define <4 x double> @test_v4f64_interp(<4 x double> %x, <4 x double> %y, <4 x do
;
; FMA4-INFS-LABEL: test_v4f64_interp:
; FMA4-INFS: # %bb.0:
-; FMA4-INFS-NEXT: vmovapd {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; FMA4-INFS-NEXT: vbroadcastsd {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA4-INFS-NEXT: vsubpd %ymm2, %ymm3, %ymm3
; FMA4-INFS-NEXT: vmulpd %ymm3, %ymm1, %ymm1
; FMA4-INFS-NEXT: vfmaddpd {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm1
diff --git a/llvm/test/CodeGen/X86/fma_patterns_wide.ll b/llvm/test/CodeGen/X86/fma_patterns_wide.ll
index 9d190a18c4552..fe5ddca67470c 100644
--- a/llvm/test/CodeGen/X86/fma_patterns_wide.ll
+++ b/llvm/test/CodeGen/X86/fma_patterns_wide.ll
@@ -259,7 +259,7 @@ define <8 x double> @test_8f64_fmsub_load(ptr %a0, <8 x double> %a1, <8 x double
define <16 x float> @test_v16f32_mul_add_x_one_y(<16 x float> %x, <16 x float> %y) {
; FMA-INFS-LABEL: test_v16f32_mul_add_x_one_y:
; FMA-INFS: # %bb.0:
-; FMA-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; FMA-INFS-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA-INFS-NEXT: vaddps %ymm4, %ymm1, %ymm1
; FMA-INFS-NEXT: vaddps %ymm4, %ymm0, %ymm0
; FMA-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0
@@ -268,7 +268,7 @@ define <16 x float> @test_v16f32_mul_add_x_one_y(<16 x float> %x, <16 x float> %
;
; FMA4-INFS-LABEL: test_v16f32_mul_add_x_one_y:
; FMA4-INFS: # %bb.0:
-; FMA4-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA4-INFS-NEXT: vaddps %ymm4, %ymm1, %ymm1
; FMA4-INFS-NEXT: vaddps %ymm4, %ymm0, %ymm0
; FMA4-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0
@@ -305,7 +305,7 @@ define <16 x float> @test_v16f32_mul_add_x_one_y(<16 x float> %x, <16 x float> %
define <8 x double> @test_v8f64_mul_y_add_x_one(<8 x double> %x, <8 x double> %y) {
; FMA-INFS-LABEL: test_v8f64_mul_y_add_x_one:
; FMA-INFS: # %bb.0:
-; FMA-INFS-NEXT: vmovapd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; FMA-INFS-NEXT: vbroadcastsd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA-INFS-NEXT: vaddpd %ymm4, %ymm1, %ymm1
; FMA-INFS-NEXT: vaddpd %ymm4, %ymm0, %ymm0
; FMA-INFS-NEXT: vmulpd %ymm0, %ymm2, %ymm0
@@ -314,7 +314,7 @@ define <8 x double> @test_v8f64_mul_y_add_x_one(<8 x double> %x, <8 x double> %y
;
; FMA4-INFS-LABEL: test_v8f64_mul_y_add_x_one:
; FMA4-INFS: # %bb.0:
-; FMA4-INFS-NEXT: vmovapd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; FMA4-INFS-NEXT: vbroadcastsd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA4-INFS-NEXT: vaddpd %ymm4, %ymm1, %ymm1
; FMA4-INFS-NEXT: vaddpd %ymm4, %ymm0, %ymm0
; FMA4-INFS-NEXT: vmulpd %ymm0, %ymm2, %ymm0
@@ -351,7 +351,7 @@ define <8 x double> @test_v8f64_mul_y_add_x_one(<8 x double> %x, <8 x double> %y
define <16 x float> @test_v16f32_mul_add_x_negone_y(<16 x float> %x, <16 x float> %y) {
; FMA-INFS-LABEL: test_v16f32_mul_add_x_negone_y:
; FMA-INFS: # %bb.0:
-; FMA-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
+; FMA-INFS-NEXT: vbroadcastss {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
; FMA-INFS-NEXT: vaddps %ymm4, %ymm1, %ymm1
; FMA-INFS-NEXT: vaddps %ymm4, %ymm0, %ymm0
; FMA-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0
@@ -360,7 +360,7 @@ define <16 x float> @test_v16f32_mul_add_x_negone_y(<16 x float> %x, <16 x float
;
; FMA4-INFS-LABEL: test_v16f32_mul_add_x_negone_y:
; FMA4-INFS: # %bb.0:
-; FMA4-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
+; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
; FMA4-INFS-NEXT: vaddps %ymm4, %ymm1, %ymm1
; FMA4-INFS-NEXT: vaddps %ymm4, %ymm0, %ymm0
; FMA4-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0
@@ -397,7 +397,7 @@ define <16 x float> @test_v16f32_mul_add_x_negone_y(<16 x float> %x, <16 x float
define <8 x double> @test_v8f64_mul_y_add_x_negone(<8 x double> %x, <8 x double> %y) {
; FMA-INFS-LABEL: test_v8f64_mul_y_add_x_negone:
; FMA-INFS: # %bb.0:
-; FMA-INFS-NEXT: vmovapd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
+; FMA-INFS-NEXT: vbroadcastsd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
; FMA-INFS-NEXT: vaddpd %ymm4, %ymm1, %ymm1
; FMA-INFS-NEXT: vaddpd %ymm4, %ymm0, %ymm0
; FMA-INFS-NEXT: vmulpd %ymm0, %ymm2, %ymm0
@@ -406,7 +406,7 @@ define <8 x double> @test_v8f64_mul_y_add_x_negone(<8 x double> %x, <8 x double>
;
; FMA4-INFS-LABEL: test_v8f64_mul_y_add_x_negone:
; FMA4-INFS: # %bb.0:
-; FMA4-INFS-NEXT: vmovapd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
+; FMA4-INFS-NEXT: vbroadcastsd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
; FMA4-INFS-NEXT: vaddpd %ymm4, %ymm1, %ymm1
; FMA4-INFS-NEXT: vaddpd %ymm4, %ymm0, %ymm0
; FMA4-INFS-NEXT: vmulpd %ymm0, %ymm2, %ymm0
@@ -443,7 +443,7 @@ define <8 x double> @test_v8f64_mul_y_add_x_negone(<8 x double> %x, <8 x double>
define <16 x float> @test_v16f32_mul_sub_one_x_y(<16 x float> %x, <16 x float> %y) {
; FMA-INFS-LABEL: test_v16f32_mul_sub_one_x_y:
; FMA-INFS: # %bb.0:
-; FMA-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; FMA-INFS-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA-INFS-NEXT: vsubps %ymm1, %ymm4, %ymm1
; FMA-INFS-NEXT: vsubps %ymm0, %ymm4, %ymm0
; FMA-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0
@@ -452,7 +452,7 @@ define <16 x float> @test_v16f32_mul_sub_one_x_y(<16 x float> %x, <16 x float> %
;
; FMA4-INFS-LABEL: test_v16f32_mul_sub_one_x_y:
; FMA4-INFS: # %bb.0:
-; FMA4-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA4-INFS-NEXT: vsubps %ymm1, %ymm4, %ymm1
; FMA4-INFS-NEXT: vsubps %ymm0, %ymm4, %ymm0
; FMA4-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0
@@ -490,7 +490,7 @@ define <16 x float> @test_v16f32_mul_sub_one_x_y(<16 x float> %x, <16 x float> %
define <8 x double> @test_v8f64_mul_y_sub_one_x(<8 x double> %x, <8 x double> %y) {
; FMA-INFS-LABEL: test_v8f64_mul_y_sub_one_x:
; FMA-INFS: # %bb.0:
-; FMA-INFS-NEXT: vmovapd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; FMA-INFS-NEXT: vbroadcastsd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA-INFS-NEXT: vsubpd %ymm1, %ymm4, %ymm1
; FMA-INFS-NEXT: vsubpd %ymm0, %ymm4, %ymm0
; FMA-INFS-NEXT: vmulpd %ymm0, %ymm2, %ymm0
@@ -499,7 +499,7 @@ define <8 x double> @test_v8f64_mul_y_sub_one_x(<8 x double> %x, <8 x double> %y
;
; FMA4-INFS-LABEL: test_v8f64_mul_y_sub_one_x:
; FMA4-INFS: # %bb.0:
-; FMA4-INFS-NEXT: vmovapd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; FMA4-INFS-NEXT: vbroadcastsd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA4-INFS-NEXT: vsubpd %ymm1, %ymm4, %ymm1
; FMA4-INFS-NEXT: vsubpd %ymm0, %ymm4, %ymm0
; FMA4-INFS-NEXT: vmulpd %ymm0, %ymm2, %ymm0
@@ -537,7 +537,7 @@ define <8 x double> @test_v8f64_mul_y_sub_one_x(<8 x double> %x, <8 x double> %y
define <16 x float> @test_v16f32_mul_sub_negone_x_y(<16 x float> %x, <16 x float> %y) {
; FMA-INFS-LABEL: test_v16f32_mul_sub_negone_x_y:
; FMA-INFS: # %bb.0:
-; FMA-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
+; FMA-INFS-NEXT: vbroadcastss {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
; FMA-INFS-NEXT: vsubps %ymm1, %ymm4, %ymm1
; FMA-INFS-NEXT: vsubps %ymm0, %ymm4, %ymm0
; FMA-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0
@@ -546,7 +546,7 @@ define <16 x float> @test_v16f32_mul_sub_negone_x_y(<16 x float> %x, <16 x float
;
; FMA4-INFS-LABEL: test_v16f32_mul_sub_negone_x_y:
; FMA4-INFS: # %bb.0:
-; FMA4-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
+; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
; FMA4-INFS-NEXT: vsubps %ymm1, %ymm4, %ymm1
; FMA4-INFS-NEXT: vsubps %ymm0, %ymm4, %ymm0
; FMA4-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0
@@ -584,7 +584,7 @@ define <16 x float> @test_v16f32_mul_sub_negone_x_y(<16 x float> %x, <16 x float
define <8 x double> @test_v8f64_mul_y_sub_negone_x(<8 x double> %x, <8 x double> %y) {
; FMA-INFS-LABEL: test_v8f64_mul_y_sub_negone_x:
; FMA-INFS: # %bb.0:
-; FMA-INFS-NEXT: vmovapd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
+; FMA-INFS-NEXT: vbroadcastsd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
; FMA-INFS-NEXT: vsubpd %ymm1, %ymm4, %ymm1
; FMA-INFS-NEXT: vsubpd %ymm0, %ymm4, %ymm0
; FMA-INFS-NEXT: vmulpd %ymm0, %ymm2, %ymm0
@@ -593,7 +593,7 @@ define <8 x double> @test_v8f64_mul_y_sub_negone_x(<8 x double> %x, <8 x double>
;
; FMA4-INFS-LABEL: test_v8f64_mul_y_sub_negone_x:
; FMA4-INFS: # %bb.0:
-; FMA4-INFS-NEXT: vmovapd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
+; FMA4-INFS-NEXT: vbroadcastsd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
; FMA4-INFS-NEXT: vsubpd %ymm1, %ymm4, %ymm1
; FMA4-INFS-NEXT: vsubpd %ymm0, %ymm4, %ymm0
; FMA4-INFS-NEXT: vmulpd %ymm0, %ymm2, %ymm0
@@ -631,7 +631,7 @@ define <8 x double> @test_v8f64_mul_y_sub_negone_x(<8 x double> %x, <8 x double>
define <16 x float> @test_v16f32_mul_sub_x_one_y(<16 x float> %x, <16 x float> %y) {
; FMA-INFS-LABEL: test_v16f32_mul_sub_x_one_y:
; FMA-INFS: # %bb.0:
-; FMA-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
+; FMA-INFS-NEXT: vbroadcastss {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
; FMA-INFS-NEXT: vaddps %ymm4, %ymm1, %ymm1
; FMA-INFS-NEXT: vaddps %ymm4, %ymm0, %ymm0
; FMA-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0
@@ -640,7 +640,7 @@ define <16 x float> @test_v16f32_mul_sub_x_one_y(<16 x float> %x, <16 x float> %
;
; FMA4-INFS-LABEL: test_v16f32_mul_sub_x_one_y:
; FMA4-INFS: # %bb.0:
-; FMA4-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
+; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
; FMA4-INFS-NEXT: vaddps %ymm4, %ymm1, %ymm1
; FMA4-INFS-NEXT: vaddps %ymm4, %ymm0, %ymm0
; FMA4-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0
@@ -677,7 +677,7 @@ define <16 x float> @test_v16f32_mul_sub_x_one_y(<16 x float> %x, <16 x float> %
define <8 x double> @test_v8f64_mul_y_sub_x_one(<8 x double> %x, <8 x double> %y) {
; FMA-INFS-LABEL: test_v8f64_mul_y_sub_x_one:
; FMA-INFS: # %bb.0:
-; FMA-INFS-NEXT: vmovapd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
+; FMA-INFS-NEXT: vbroadcastsd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
; FMA-INFS-NEXT: vaddpd %ymm4, %ymm1, %ymm1
; FMA-INFS-NEXT: vaddpd %ymm4, %ymm0, %ymm0
; FMA-INFS-NEXT: vmulpd %ymm0, %ymm2, %ymm0
@@ -686,7 +686,7 @@ define <8 x double> @test_v8f64_mul_y_sub_x_one(<8 x double> %x, <8 x double> %y
;
; FMA4-INFS-LABEL: test_v8f64_mul_y_sub_x_one:
; FMA4-INFS: # %bb.0:
-; FMA4-INFS-NEXT: vmovapd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
+; FMA4-INFS-NEXT: vbroadcastsd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
; FMA4-INFS-NEXT: vaddpd %ymm4, %ymm1, %ymm1
; FMA4-INFS-NEXT: vaddpd %ymm4, %ymm0, %ymm0
; FMA4-INFS-NEXT: vmulpd %ymm0, %ymm2, %ymm0
@@ -723,7 +723,7 @@ define <8 x double> @test_v8f64_mul_y_sub_x_one(<8 x double> %x, <8 x double> %y
define <16 x float> @test_v16f32_mul_sub_x_negone_y(<16 x float> %x, <16 x float> %y) {
; FMA-INFS-LABEL: test_v16f32_mul_sub_x_negone_y:
; FMA-INFS: # %bb.0:
-; FMA-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; FMA-INFS-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA-INFS-NEXT: vaddps %ymm4, %ymm1, %ymm1
; FMA-INFS-NEXT: vaddps %ymm4, %ymm0, %ymm0
; FMA-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0
@@ -732,7 +732,7 @@ define <16 x float> @test_v16f32_mul_sub_x_negone_y(<16 x float> %x, <16 x float
;
; FMA4-INFS-LABEL: test_v16f32_mul_sub_x_negone_y:
; FMA4-INFS: # %bb.0:
-; FMA4-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA4-INFS-NEXT: vaddps %ymm4, %ymm1, %ymm1
; FMA4-INFS-NEXT: vaddps %ymm4, %ymm0, %ymm0
; FMA4-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0
@@ -769,7 +769,7 @@ define <16 x float> @test_v16f32_mul_sub_x_negone_y(<16 x float> %x, <16 x float
define <8 x double> @test_v8f64_mul_y_sub_x_negone(<8 x double> %x, <8 x double> %y) {
; FMA-INFS-LABEL: test_v8f64_mul_y_sub_x_negone:
; FMA-INFS: # %bb.0:
-; FMA-INFS-NEXT: vmovapd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; FMA-INFS-NEXT: vbroadcastsd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA-INFS-NEXT: vaddpd %ymm4, %ymm1, %ymm1
; FMA-INFS-NEXT: vaddpd %ymm4, %ymm0, %ymm0
; FMA-INFS-NEXT: vmulpd %ymm0, %ymm2, %ymm0
@@ -778,7 +778,7 @@ define <8 x double> @test_v8f64_mul_y_sub_x_negone(<8 x double> %x, <8 x double>
;
; FMA4-INFS-LABEL: test_v8f64_mul_y_sub_x_negone:
; FMA4-INFS: # %bb.0:
-; FMA4-INFS-NEXT: vmovapd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; FMA4-INFS-NEXT: vbroadcastsd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA4-INFS-NEXT: vaddpd %ymm4, %ymm1, %ymm1
; FMA4-INFS-NEXT: vaddpd %ymm4, %ymm0, %ymm0
; FMA4-INFS-NEXT: vmulpd %ymm0, %ymm2, %ymm0
@@ -819,7 +819,7 @@ define <8 x double> @test_v8f64_mul_y_sub_x_negone(<8 x double> %x, <8 x double>
define <16 x float> @test_v16f32_interp(<16 x float> %x, <16 x float> %y, <16 x float> %t) {
; FMA-INFS-LABEL: test_v16f32_interp:
; FMA-INFS: # %bb.0:
-; FMA-INFS-NEXT: vmovaps {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; FMA-INFS-NEXT: vbroadcastss {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA-INFS-NEXT: vsubps %ymm4, %ymm6, %ymm7
; FMA-INFS-NEXT: vsubps %ymm5, %ymm6, %ymm6
; FMA-INFS-NEXT: vmulps %ymm6, %ymm3, %ymm3
@@ -830,7 +830,7 @@ define <16 x float> @test_v16f32_interp(<16 x float> %x, <16 x float> %y, <16 x
;
; FMA4-INFS-LABEL: test_v16f32_interp:
; FMA4-INFS: # %bb.0:
-; FMA4-INFS-NEXT: vmovaps {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA4-INFS-NEXT: vsubps %ymm4, %ymm6, %ymm7
; FMA4-INFS-NEXT: vsubps %ymm5, %ymm6, %ymm6
; FMA4-INFS-NEXT: vmulps %ymm6, %ymm3, %ymm3
@@ -878,7 +878,7 @@ define <16 x float> @test_v16f32_interp(<16 x float> %x, <16 x float> %y, <16 x
define <8 x double> @test_v8f64_interp(<8 x double> %x, <8 x double> %y, <8 x double> %t) {
; FMA-INFS-LABEL: test_v8f64_interp:
; FMA-INFS: # %bb.0:
-; FMA-INFS-NEXT: vmovapd {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; FMA-INFS-NEXT: vbroadcastsd {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA-INFS-NEXT: vsubpd %ymm4, %ymm6, %ymm7
; FMA-INFS-NEXT: vsubpd %ymm5, %ymm6, %ymm6
; FMA-INFS-NEXT: vmulpd %ymm6, %ymm3, %ymm3
@@ -889,7 +889,7 @@ define <8 x double> @test_v8f64_interp(<8 x double> %x, <8 x double> %y, <8 x do
;
; FMA4-INFS-LABEL: test_v8f64_interp:
; FMA4-INFS: # %bb.0:
-; FMA4-INFS-NEXT: vmovapd {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; FMA4-INFS-NEXT: vbroadcastsd {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA4-INFS-NEXT: vsubpd %ymm4, %ymm6, %ymm7
; FMA4-INFS-NEXT: vsubpd %ymm5, %ymm6, %ymm6
; FMA4-INFS-NEXT: vmulpd %ymm6, %ymm3, %ymm3
@@ -1143,7 +1143,7 @@ define <8 x double> @test_v8f64_fneg_fmul_no_nsz(<8 x double> %x, <8 x double> %
; FMA: # %bb.0:
; FMA-NEXT: vmulpd %ymm3, %ymm1, %ymm1
; FMA-NEXT: vmulpd %ymm2, %ymm0, %ymm0
-; FMA-NEXT: vmovapd {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; FMA-NEXT: vbroadcastsd {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; FMA-NEXT: vxorpd %ymm2, %ymm0, %ymm0
; FMA-NEXT: vxorpd %ymm2, %ymm1, %ymm1
; FMA-NEXT: retq
@@ -1152,7 +1152,7 @@ define <8 x double> @test_v8f64_fneg_fmul_no_nsz(<8 x double> %x, <8 x double> %
; FMA4: # %bb.0:
; FMA4-NEXT: vmulpd %ymm3, %ymm1, %ymm1
; FMA4-NEXT: vmulpd %ymm2, %ymm0, %ymm0
-; FMA4-NEXT: vmovapd {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; FMA4-NEXT: vbroadcastsd {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; FMA4-NEXT: vxorpd %ymm2, %ymm0, %ymm0
; FMA4-NEXT: vxorpd %ymm2, %ymm1, %ymm1
; FMA4-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/fminimum-fmaximum.ll b/llvm/test/CodeGen/X86/fminimum-fmaximum.ll
index 9c2a7adf5431a..5bb5d1e9c17ec 100644
--- a/llvm/test/CodeGen/X86/fminimum-fmaximum.ll
+++ b/llvm/test/CodeGen/X86/fminimum-fmaximum.ll
@@ -1070,21 +1070,15 @@ define <4 x float> @test_fmaximum_vector_signed_zero(<4 x float> %x) {
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
-; AVX1-LABEL: test_fmaximum_vector_signed_zero:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
-; AVX1-NEXT: vmaxps %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX512-LABEL: test_fmaximum_vector_signed_zero:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
-; AVX512-NEXT: vmaxps %xmm0, %xmm1, %xmm0
-; AVX512-NEXT: retq
+; AVX-LABEL: test_fmaximum_vector_signed_zero:
+; AVX: # %bb.0:
+; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; AVX-NEXT: vmaxps %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retq
;
; X86-LABEL: test_fmaximum_vector_signed_zero:
; X86: # %bb.0:
-; X86-NEXT: vmovaps {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; X86-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; X86-NEXT: vmaxps %xmm0, %xmm1, %xmm0
; X86-NEXT: retl
%r = call <4 x float> @llvm.maximum.v4f32(<4 x float> %x, <4 x float> <float -0., float -0., float -0., float -0.>)
@@ -1283,21 +1277,15 @@ define <4 x float> @test_fmaximum_vector_signed_zero_first(<4 x float> %x) {
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
-; AVX1-LABEL: test_fmaximum_vector_signed_zero_first:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
-; AVX1-NEXT: vmaxps %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX512-LABEL: test_fmaximum_vector_signed_zero_first:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
-; AVX512-NEXT: vmaxps %xmm0, %xmm1, %xmm0
-; AVX512-NEXT: retq
+; AVX-LABEL: test_fmaximum_vector_signed_zero_first:
+; AVX: # %bb.0:
+; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; AVX-NEXT: vmaxps %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retq
;
; X86-LABEL: test_fmaximum_vector_signed_zero_first:
; X86: # %bb.0:
-; X86-NEXT: vmovaps {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; X86-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; X86-NEXT: vmaxps %xmm0, %xmm1, %xmm0
; X86-NEXT: retl
%r = call <4 x float> @llvm.maximum.v4f32(<4 x float> <float -0., float -0., float -0., float -0.>, <4 x float> %x)
diff --git a/llvm/test/CodeGen/X86/fold-vector-sext-zext.ll b/llvm/test/CodeGen/X86/fold-vector-sext-zext.ll
index 3ff68ac329c99..3f8bd24c38049 100644
--- a/llvm/test/CodeGen/X86/fold-vector-sext-zext.ll
+++ b/llvm/test/CodeGen/X86/fold-vector-sext-zext.ll
@@ -11,12 +11,14 @@
define <4 x i16> @test_sext_4i8_4i16() {
; X32-LABEL: test_sext_4i8_4i16:
; X32: # %bb.0:
-; X32-NEXT: vmovaps {{.*#+}} xmm0 = <0,65535,2,65533,u,u,u,u>
+; X32-NEXT: vmovddup {{.*#+}} xmm0 = [0,65535,2,65533,0,65535,2,65533]
+; X32-NEXT: # xmm0 = mem[0,0]
; X32-NEXT: retl
;
; X64-LABEL: test_sext_4i8_4i16:
; X64: # %bb.0:
-; X64-NEXT: vmovaps {{.*#+}} xmm0 = <0,65535,2,65533,u,u,u,u>
+; X64-NEXT: vmovddup {{.*#+}} xmm0 = [0,65535,2,65533,0,65535,2,65533]
+; X64-NEXT: # xmm0 = mem[0,0]
; X64-NEXT: retq
%1 = insertelement <4 x i8> undef, i8 0, i32 0
%2 = insertelement <4 x i8> %1, i8 -1, i32 1
@@ -29,12 +31,14 @@ define <4 x i16> @test_sext_4i8_4i16() {
define <4 x i16> @test_sext_4i8_4i16_undef() {
; X32-LABEL: test_sext_4i8_4i16_undef:
; X32: # %bb.0:
-; X32-NEXT: vmovaps {{.*#+}} xmm0 = <0,65535,0,65533,u,u,u,u>
+; X32-NEXT: vmovddup {{.*#+}} xmm0 = [0,65535,0,65533,0,65535,0,65533]
+; X32-NEXT: # xmm0 = mem[0,0]
; X32-NEXT: retl
;
; X64-LABEL: test_sext_4i8_4i16_undef:
; X64: # %bb.0:
-; X64-NEXT: vmovaps {{.*#+}} xmm0 = <0,65535,0,65533,u,u,u,u>
+; X64-NEXT: vmovddup {{.*#+}} xmm0 = [0,65535,0,65533,0,65535,0,65533]
+; X64-NEXT: # xmm0 = mem[0,0]
; X64-NEXT: retq
%1 = insertelement <4 x i8> undef, i8 undef, i32 0
%2 = insertelement <4 x i8> %1, i8 -1, i32 1
@@ -207,12 +211,14 @@ define <8 x i32> @test_sext_8i8_8i32_undef() {
define <4 x i16> @test_zext_4i8_4i16() {
; X32-LABEL: test_zext_4i8_4i16:
; X32: # %bb.0:
-; X32-NEXT: vmovaps {{.*#+}} xmm0 = <0,255,2,253,u,u,u,u>
+; X32-NEXT: vmovddup {{.*#+}} xmm0 = [0,255,2,253,0,255,2,253]
+; X32-NEXT: # xmm0 = mem[0,0]
; X32-NEXT: retl
;
; X64-LABEL: test_zext_4i8_4i16:
; X64: # %bb.0:
-; X64-NEXT: vmovaps {{.*#+}} xmm0 = <0,255,2,253,u,u,u,u>
+; X64-NEXT: vmovddup {{.*#+}} xmm0 = [0,255,2,253,0,255,2,253]
+; X64-NEXT: # xmm0 = mem[0,0]
; X64-NEXT: retq
%1 = insertelement <4 x i8> undef, i8 0, i32 0
%2 = insertelement <4 x i8> %1, i8 -1, i32 1
@@ -261,12 +267,14 @@ define <4 x i64> @test_zext_4i8_4i64() {
define <4 x i16> @test_zext_4i8_4i16_undef() {
; X32-LABEL: test_zext_4i8_4i16_undef:
; X32: # %bb.0:
-; X32-NEXT: vmovaps {{.*#+}} xmm0 = <0,255,0,253,u,u,u,u>
+; X32-NEXT: vmovddup {{.*#+}} xmm0 = [0,255,0,253,0,255,0,253]
+; X32-NEXT: # xmm0 = mem[0,0]
; X32-NEXT: retl
;
; X64-LABEL: test_zext_4i8_4i16_undef:
; X64: # %bb.0:
-; X64-NEXT: vmovaps {{.*#+}} xmm0 = <0,255,0,253,u,u,u,u>
+; X64-NEXT: vmovddup {{.*#+}} xmm0 = [0,255,0,253,0,255,0,253]
+; X64-NEXT: # xmm0 = mem[0,0]
; X64-NEXT: retq
%1 = insertelement <4 x i8> undef, i8 undef, i32 0
%2 = insertelement <4 x i8> %1, i8 -1, i32 1
diff --git a/llvm/test/CodeGen/X86/fold-vector-trunc-sitofp.ll b/llvm/test/CodeGen/X86/fold-vector-trunc-sitofp.ll
index 73c7dc1fae56f..41989122a01eb 100644
--- a/llvm/test/CodeGen/X86/fold-vector-trunc-sitofp.ll
+++ b/llvm/test/CodeGen/X86/fold-vector-trunc-sitofp.ll
@@ -7,7 +7,8 @@
define <4 x float> @test1() {
; CHECK-LABEL: test1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [-1.0E+0,0.0E+0,-1.0E+0,0.0E+0]
+; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = [-1.0E+0,0.0E+0,-1.0E+0,0.0E+0]
+; CHECK-NEXT: # xmm0 = mem[0,0]
; CHECK-NEXT: ret{{[l|q]}}
%1 = trunc <4 x i3> <i3 -1, i3 -22, i3 7, i3 8> to <4 x i1>
%2 = sitofp <4 x i1> %1 to <4 x float>
diff --git a/llvm/test/CodeGen/X86/fp-round.ll b/llvm/test/CodeGen/X86/fp-round.ll
index e8f3f069d01b3..8efd5819a6d22 100644
--- a/llvm/test/CodeGen/X86/fp-round.ll
+++ b/llvm/test/CodeGen/X86/fp-round.ll
@@ -572,9 +572,9 @@ define <16 x float> @round_v16f32(<16 x float> %x) {
;
; AVX1-LABEL: round_v16f32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm3
-; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm4 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
; AVX1-NEXT: vorps %ymm4, %ymm3, %ymm3
; AVX1-NEXT: vaddps %ymm3, %ymm0, %ymm0
; AVX1-NEXT: vroundps $11, %ymm0, %ymm0
@@ -680,9 +680,9 @@ define <8 x double> @round_v8f64(<8 x double> %x) {
;
; AVX1-LABEL: round_v8f64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovapd {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; AVX1-NEXT: vandpd %ymm2, %ymm0, %ymm3
-; AVX1-NEXT: vmovapd {{.*#+}} ymm4 = [4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1]
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1]
; AVX1-NEXT: vorpd %ymm4, %ymm3, %ymm3
; AVX1-NEXT: vaddpd %ymm3, %ymm0, %ymm0
; AVX1-NEXT: vroundpd $11, %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/insert-into-constant-vector.ll b/llvm/test/CodeGen/X86/insert-into-constant-vector.ll
index 16f3b0a48f48b..364fd81eb1aa9 100644
--- a/llvm/test/CodeGen/X86/insert-into-constant-vector.ll
+++ b/llvm/test/CodeGen/X86/insert-into-constant-vector.ll
@@ -219,13 +219,15 @@ define <2 x double> @elt1_v2f64(double %x) {
;
; X86-AVX-LABEL: elt1_v2f64:
; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: vmovaps {{.*#+}} xmm0 = <4.2E+1,u>
+; X86-AVX-NEXT: vmovddup {{.*#+}} xmm0 = [4.2E+1,4.2E+1]
+; X86-AVX-NEXT: # xmm0 = mem[0,0]
; X86-AVX-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; X86-AVX-NEXT: retl
;
; X64-AVX-LABEL: elt1_v2f64:
; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vmovaps {{.*#+}} xmm1 = <4.2E+1,u>
+; X64-AVX-NEXT: vmovddup {{.*#+}} xmm1 = [4.2E+1,4.2E+1]
+; X64-AVX-NEXT: # xmm1 = mem[0,0]
; X64-AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; X64-AVX-NEXT: retq
%ins = insertelement <2 x double> <double 42.0, double 1.0>, double %x, i32 1
diff --git a/llvm/test/CodeGen/X86/known-bits-vector.ll b/llvm/test/CodeGen/X86/known-bits-vector.ll
index ebcb45d8b0f43..2eef32eb61414 100644
--- a/llvm/test/CodeGen/X86/known-bits-vector.ll
+++ b/llvm/test/CodeGen/X86/known-bits-vector.ll
@@ -156,12 +156,12 @@ define <4 x float> @knownbits_mask_shuffle_uitofp(<4 x i32> %a0) nounwind {
define <4 x float> @knownbits_mask_or_shuffle_uitofp(<4 x i32> %a0) nounwind {
; X86-LABEL: knownbits_mask_or_shuffle_uitofp:
; X86: # %bb.0:
-; X86-NEXT: vmovaps {{.*#+}} xmm0 = [6.5535E+4,6.5535E+4,6.5535E+4,6.5535E+4]
+; X86-NEXT: vbroadcastss {{.*#+}} xmm0 = [6.5535E+4,6.5535E+4,6.5535E+4,6.5535E+4]
; X86-NEXT: retl
;
; X64-LABEL: knownbits_mask_or_shuffle_uitofp:
; X64: # %bb.0:
-; X64-NEXT: vmovaps {{.*#+}} xmm0 = [6.5535E+4,6.5535E+4,6.5535E+4,6.5535E+4]
+; X64-NEXT: vbroadcastss {{.*#+}} xmm0 = [6.5535E+4,6.5535E+4,6.5535E+4,6.5535E+4]
; X64-NEXT: retq
%1 = and <4 x i32> %a0, <i32 -1, i32 -1, i32 255, i32 4085>
%2 = or <4 x i32> %1, <i32 65535, i32 65535, i32 65535, i32 65535>
@@ -385,7 +385,7 @@ define <8 x float> @knownbits_mask_concat_uitofp(<4 x i32> %a0, <4 x i32> %a1) n
; X86-LABEL: knownbits_mask_concat_uitofp:
; X86: # %bb.0:
; X86-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3,1,3]
-; X86-NEXT: vmovaps {{.*#+}} xmm2 = [131071,131071,131071,131071]
+; X86-NEXT: vbroadcastss {{.*#+}} xmm2 = [131071,131071,131071,131071]
; X86-NEXT: vandps %xmm2, %xmm1, %xmm1
; X86-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,0,2]
; X86-NEXT: vandps %xmm2, %xmm0, %xmm0
@@ -396,7 +396,7 @@ define <8 x float> @knownbits_mask_concat_uitofp(<4 x i32> %a0, <4 x i32> %a1) n
; X64-LABEL: knownbits_mask_concat_uitofp:
; X64: # %bb.0:
; X64-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3,1,3]
-; X64-NEXT: vmovaps {{.*#+}} xmm2 = [131071,131071,131071,131071]
+; X64-NEXT: vbroadcastss {{.*#+}} xmm2 = [131071,131071,131071,131071]
; X64-NEXT: vandps %xmm2, %xmm1, %xmm1
; X64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,0,2]
; X64-NEXT: vandps %xmm2, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/masked_store_trunc.ll b/llvm/test/CodeGen/X86/masked_store_trunc.ll
index b756165172650..17548df343251 100644
--- a/llvm/test/CodeGen/X86/masked_store_trunc.ll
+++ b/llvm/test/CodeGen/X86/masked_store_trunc.ll
@@ -376,7 +376,7 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
;
; AVX1-LABEL: truncstore_v8i64_v8i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,65535,65535]
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm3 = [65535,65535,65535,65535]
; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vpackusdw %xmm4, %xmm1, %xmm1
@@ -764,7 +764,7 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
;
; AVX1-LABEL: truncstore_v8i64_v8i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [255,255,255,255]
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm3 = [255,255,255,255]
; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vpackusdw %xmm4, %xmm1, %xmm1
@@ -2221,7 +2221,7 @@ define void @truncstore_v16i32_v16i16(<16 x i32> %x, ptr %p, <16 x i32> %mask) {
;
; AVX1-LABEL: truncstore_v16i32_v16i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm4 = [65535,65535,65535,65535,65535,65535,65535,65535]
; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
; AVX1-NEXT: vpackusdw %xmm5, %xmm1, %xmm1
@@ -2897,7 +2897,7 @@ define void @truncstore_v16i32_v16i8(<16 x i32> %x, ptr %p, <16 x i32> %mask) {
;
; AVX1-LABEL: truncstore_v16i32_v16i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
; AVX1-NEXT: vpackusdw %xmm5, %xmm1, %xmm1
@@ -4879,7 +4879,7 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, ptr %p, <32 x i8> %mask) {
;
; AVX1-LABEL: truncstore_v32i16_v32i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vpackuswb %xmm4, %xmm1, %xmm1
diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
index c15e73daceb9b..682e2002c075a 100644
--- a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
+++ b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
@@ -232,7 +232,8 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm4
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372041149743103,9223372041149743103]
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vmovapd {{.*#+}} xmm6 = [4294967295,4294967295]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [4294967295,4294967295]
+; AVX1-NEXT: # xmm6 = mem[0,0]
; AVX1-NEXT: vblendvpd %xmm4, %xmm1, %xmm6, %xmm4
; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm7
; AVX1-NEXT: vpcmpgtq %xmm7, %xmm5, %xmm7
@@ -545,7 +546,8 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm5
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [9223372036854841343,9223372036854841343]
; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5
-; AVX1-NEXT: vmovapd {{.*#+}} xmm7 = [65535,65535]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [65535,65535]
+; AVX1-NEXT: # xmm7 = mem[0,0]
; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm7, %xmm3
; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm5
; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5
@@ -1018,7 +1020,8 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm5
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [9223372036854776063,9223372036854776063]
; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5
-; AVX1-NEXT: vmovapd {{.*#+}} xmm7 = [255,255]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [255,255]
+; AVX1-NEXT: # xmm7 = mem[0,0]
; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm7, %xmm3
; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm5
; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5
@@ -1393,7 +1396,8 @@ define void @truncstore_v4i64_v4i32(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm4
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372041149743103,9223372041149743103]
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vmovapd {{.*#+}} xmm6 = [4294967295,4294967295]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [4294967295,4294967295]
+; AVX1-NEXT: # xmm6 = mem[0,0]
; AVX1-NEXT: vblendvpd %xmm4, %xmm2, %xmm6, %xmm2
; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm5, %xmm3
@@ -1588,7 +1592,8 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm5
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [9223372036854841343,9223372036854841343]
; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5
-; AVX1-NEXT: vmovapd {{.*#+}} xmm7 = [65535,65535]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [65535,65535]
+; AVX1-NEXT: # xmm7 = mem[0,0]
; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm7, %xmm3
; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm4
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm4
@@ -1869,7 +1874,8 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm4
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854776063,9223372036854776063]
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vmovapd {{.*#+}} xmm6 = [255,255]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [255,255]
+; AVX1-NEXT: # xmm6 = mem[0,0]
; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm6, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm3
@@ -2099,7 +2105,8 @@ define void @truncstore_v2i64_v2i32(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero
-; AVX1-NEXT: vmovapd {{.*#+}} xmm2 = [4294967295,4294967295]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [4294967295,4294967295]
+; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372041149743103,9223372041149743103]
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
@@ -2115,7 +2122,8 @@ define void @truncstore_v2i64_v2i32(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero
-; AVX2-NEXT: vmovapd {{.*#+}} xmm2 = [4294967295,4294967295]
+; AVX2-NEXT: vmovddup {{.*#+}} xmm2 = [4294967295,4294967295]
+; AVX2-NEXT: # xmm2 = mem[0,0]
; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3
; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372041149743103,9223372041149743103]
; AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
@@ -2234,7 +2242,8 @@ define void @truncstore_v2i64_v2i16(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
; AVX-LABEL: truncstore_v2i64_v2i16:
; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vmovapd {{.*#+}} xmm3 = [65535,65535]
+; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [65535,65535]
+; AVX-NEXT: # xmm3 = mem[0,0]
; AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm4
; AVX-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854841343,9223372036854841343]
; AVX-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4
@@ -2381,7 +2390,8 @@ define void @truncstore_v2i64_v2i8(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
; AVX-LABEL: truncstore_v2i64_v2i8:
; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vmovapd {{.*#+}} xmm3 = [255,255]
+; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [255,255]
+; AVX-NEXT: # xmm3 = mem[0,0]
; AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm4
; AVX-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854776063,9223372036854776063]
; AVX-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4
diff --git a/llvm/test/CodeGen/X86/memset-nonzero.ll b/llvm/test/CodeGen/X86/memset-nonzero.ll
index 5a0c703ae2ea4..96ac8dff79530 100644
--- a/llvm/test/CodeGen/X86/memset-nonzero.ll
+++ b/llvm/test/CodeGen/X86/memset-nonzero.ll
@@ -28,7 +28,7 @@ define void @memset_16_nonzero_bytes(ptr %x) {
;
; AVX-LABEL: memset_16_nonzero_bytes:
; AVX: # %bb.0:
-; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
+; AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX-NEXT: vmovups %xmm0, (%rdi)
; AVX-NEXT: retq
%call = tail call ptr @__memset_chk(ptr %x, i32 42, i64 16, i64 -1)
@@ -54,7 +54,7 @@ define void @memset_32_nonzero_bytes(ptr %x) {
;
; AVX-LABEL: memset_32_nonzero_bytes:
; AVX: # %bb.0:
-; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
+; AVX-NEXT: vbroadcastss {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX-NEXT: vmovups %ymm0, (%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
@@ -87,7 +87,7 @@ define void @memset_64_nonzero_bytes(ptr %x) {
;
; AVX1-LABEL: memset_64_nonzero_bytes:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX1-NEXT: vmovups %ymm0, 32(%rdi)
; AVX1-NEXT: vmovups %ymm0, (%rdi)
; AVX1-NEXT: vzeroupper
@@ -95,7 +95,7 @@ define void @memset_64_nonzero_bytes(ptr %x) {
;
; AVX2-LABEL: memset_64_nonzero_bytes:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
+; AVX2-NEXT: vbroadcastss {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX2-NEXT: vmovups %ymm0, 32(%rdi)
; AVX2-NEXT: vmovups %ymm0, (%rdi)
; AVX2-NEXT: vzeroupper
@@ -110,7 +110,7 @@ define void @memset_64_nonzero_bytes(ptr %x) {
;
; AVX512BW-LABEL: memset_64_nonzero_bytes:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovaps {{.*#+}} zmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
+; AVX512BW-NEXT: vbroadcastss {{.*#+}} zmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX512BW-NEXT: vmovups %zmm0, (%rdi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
@@ -156,7 +156,7 @@ define void @memset_128_nonzero_bytes(ptr %x) {
;
; AVX1-LABEL: memset_128_nonzero_bytes:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX1-NEXT: vmovups %ymm0, 96(%rdi)
; AVX1-NEXT: vmovups %ymm0, 64(%rdi)
; AVX1-NEXT: vmovups %ymm0, 32(%rdi)
@@ -166,7 +166,7 @@ define void @memset_128_nonzero_bytes(ptr %x) {
;
; AVX2-LABEL: memset_128_nonzero_bytes:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
+; AVX2-NEXT: vbroadcastss {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX2-NEXT: vmovups %ymm0, 96(%rdi)
; AVX2-NEXT: vmovups %ymm0, 64(%rdi)
; AVX2-NEXT: vmovups %ymm0, 32(%rdi)
@@ -184,7 +184,7 @@ define void @memset_128_nonzero_bytes(ptr %x) {
;
; AVX512BW-LABEL: memset_128_nonzero_bytes:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovaps {{.*#+}} zmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
+; AVX512BW-NEXT: vbroadcastss {{.*#+}} zmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX512BW-NEXT: vmovups %zmm0, 64(%rdi)
; AVX512BW-NEXT: vmovups %zmm0, (%rdi)
; AVX512BW-NEXT: vzeroupper
@@ -223,7 +223,7 @@ define void @memset_256_nonzero_bytes(ptr %x) {
;
; AVX1-LABEL: memset_256_nonzero_bytes:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX1-NEXT: vmovups %ymm0, 224(%rdi)
; AVX1-NEXT: vmovups %ymm0, 192(%rdi)
; AVX1-NEXT: vmovups %ymm0, 160(%rdi)
@@ -237,7 +237,7 @@ define void @memset_256_nonzero_bytes(ptr %x) {
;
; AVX2-LABEL: memset_256_nonzero_bytes:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
+; AVX2-NEXT: vbroadcastss {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX2-NEXT: vmovups %ymm0, 224(%rdi)
; AVX2-NEXT: vmovups %ymm0, 192(%rdi)
; AVX2-NEXT: vmovups %ymm0, 160(%rdi)
@@ -261,7 +261,7 @@ define void @memset_256_nonzero_bytes(ptr %x) {
;
; AVX512BW-LABEL: memset_256_nonzero_bytes:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovaps {{.*#+}} zmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
+; AVX512BW-NEXT: vbroadcastss {{.*#+}} zmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX512BW-NEXT: vmovups %zmm0, 192(%rdi)
; AVX512BW-NEXT: vmovups %zmm0, 128(%rdi)
; AVX512BW-NEXT: vmovups %zmm0, 64(%rdi)
diff --git a/llvm/test/CodeGen/X86/merge-store-constants.ll b/llvm/test/CodeGen/X86/merge-store-constants.ll
index e7778c0aaf322..8030d5f08fa57 100644
--- a/llvm/test/CodeGen/X86/merge-store-constants.ll
+++ b/llvm/test/CodeGen/X86/merge-store-constants.ll
@@ -58,14 +58,14 @@ define void @big_nonzero_32_bytes_splat(ptr nocapture %a) {
; X32-LABEL: big_nonzero_32_bytes_splat:
; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42]
+; X32-NEXT: vbroadcastss {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42]
; X32-NEXT: vmovups %ymm0, (%eax)
; X32-NEXT: vzeroupper
; X32-NEXT: retl
;
; X64-LABEL: big_nonzero_32_bytes_splat:
; X64: # %bb.0:
-; X64-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42]
+; X64-NEXT: vbroadcastss {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42]
; X64-NEXT: vmovups %ymm0, (%rdi)
; X64-NEXT: vzeroupper
; X64-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll
index e17e9d3a6573e..fa36c15b6445a 100644
--- a/llvm/test/CodeGen/X86/oddshuffles.ll
+++ b/llvm/test/CodeGen/X86/oddshuffles.ll
@@ -546,7 +546,7 @@ define void @v12i32(<8 x i32> %a, <8 x i32> %b, ptr %p) nounwind {
; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm2, %ymm2
; AVX2-FAST-ALL-NEXT: vbroadcastsd %xmm1, %ymm3
; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
-; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm3 = <u,3,7,u,u,u,u,u>
+; AVX2-FAST-ALL-NEXT: vbroadcastsd {{.*#+}} ymm3 = [7,3,7,3,7,3,7,3]
; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm3, %ymm0
; AVX2-FAST-ALL-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
@@ -1547,7 +1547,8 @@ define void @interleave_24i32_out(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind {
; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm6 = <1,4,7,2,5,u,u,u>
; AVX2-FAST-ALL-NEXT: vpermps %ymm5, %ymm6, %ymm5
; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7]
-; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm5 = <u,u,u,u,u,1,4,7>
+; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm5 = [0,1,4,7,0,1,4,7]
+; AVX2-FAST-ALL-NEXT: # ymm5 = mem[0,1,0,1]
; AVX2-FAST-ALL-NEXT: vpermps %ymm2, %ymm5, %ymm2
; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u>
@@ -1773,7 +1774,8 @@ define void @interleave_24i32_in(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind {
; AVX2-FAST-ALL-NEXT: vmovups (%rsi), %ymm0
; AVX2-FAST-ALL-NEXT: vmovups (%rdx), %ymm1
; AVX2-FAST-ALL-NEXT: vmovups (%rcx), %ymm2
-; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm3 = <5,u,u,6,u,u,7,u>
+; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [5,0,7,6,5,0,7,6]
+; AVX2-FAST-ALL-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm3, %ymm3
; AVX2-FAST-ALL-NEXT: vbroadcastsd 24(%rsi), %ymm4
; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
diff --git a/llvm/test/CodeGen/X86/paddus.ll b/llvm/test/CodeGen/X86/paddus.ll
index d4f3d4b9d1401..40d6ec6fb3155 100644
--- a/llvm/test/CodeGen/X86/paddus.ll
+++ b/llvm/test/CodeGen/X86/paddus.ll
@@ -613,7 +613,7 @@ define <64 x i8> @test17(<64 x i8> %x) {
;
; AVX1-LABEL: test17:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm3
; AVX1-NEXT: vxorps %ymm2, %ymm1, %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
@@ -1421,7 +1421,7 @@ define <32 x i16> @test35(<32 x i16> %x) {
;
; AVX1-LABEL: test35:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm3
; AVX1-NEXT: vxorps %ymm2, %ymm1, %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
diff --git a/llvm/test/CodeGen/X86/pr30290.ll b/llvm/test/CodeGen/X86/pr30290.ll
index 74e553191331f..478cb142475da 100644
--- a/llvm/test/CodeGen/X86/pr30290.ll
+++ b/llvm/test/CodeGen/X86/pr30290.ll
@@ -20,7 +20,7 @@ define void @foo(ptr byval(%struct.face) nocapture align 8) local_unnamed_addr {
; CHECK: # %bb.0:
; CHECK-NEXT: subq $40, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 48
-; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,1,1]
+; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1]
; CHECK-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movl $1, {{[0-9]+}}(%rsp)
; CHECK-NEXT: vmovups {{[0-9]+}}(%rsp), %xmm0
diff --git a/llvm/test/CodeGen/X86/pr32368.ll b/llvm/test/CodeGen/X86/pr32368.ll
index c10bacea688aa..52cf6fb07d672 100644
--- a/llvm/test/CodeGen/X86/pr32368.ll
+++ b/llvm/test/CodeGen/X86/pr32368.ll
@@ -114,12 +114,12 @@ define <16 x float> @PR32368_512(<16 x float>) {
;
; AVX1-LABEL: PR32368_512:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [4294967004,4294967004,4294967004,4294967004,4294967004,4294967004,4294967004,4294967004]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [4294967004,4294967004,4294967004,4294967004,4294967004,4294967004,4294967004,4294967004]
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vaddps %ymm1, %ymm1, %ymm1
; AVX1-NEXT: vaddps %ymm0, %ymm0, %ymm0
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [291,291,291,291,291,291,291,291]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [291,291,291,291,291,291,291,291]
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/pr38639.ll b/llvm/test/CodeGen/X86/pr38639.ll
index c4a085b0b32a8..15cc7581454aa 100644
--- a/llvm/test/CodeGen/X86/pr38639.ll
+++ b/llvm/test/CodeGen/X86/pr38639.ll
@@ -4,11 +4,12 @@
define <8 x double> @test(<4 x double> %a, <4 x double> %b) {
; CHECK-LABEL: test:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = <u,8.2071743224100002E-1,8.2071743224100002E-1,8.2071743224100002E-1>
+; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm1 = [8.2071743224100002E-1,8.2071743224100002E-1,8.2071743224100002E-1,8.2071743224100002E-1]
; CHECK-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; CHECK-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; CHECK-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3]
-; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [8.2071743224100002E-1,8.2071743224100002E-1]
+; CHECK-NEXT: vmovddup {{.*#+}} xmm2 = [8.2071743224100002E-1,8.2071743224100002E-1]
+; CHECK-NEXT: # xmm2 = mem[0,0]
; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; CHECK-NEXT: retq
%1 = shufflevector <4 x double> %a, <4 x double> <double undef, double 0x3FEA435134576E1C, double 0x3FEA435134576E1C, double 0x3FEA435134576E1C>, <8 x i32> <i32 6, i32 5, i32 2, i32 3, i32 5, i32 1, i32 3, i32 7>
diff --git a/llvm/test/CodeGen/X86/psubus.ll b/llvm/test/CodeGen/X86/psubus.ll
index 55e2342e8b0e6..faeaef7b40a62 100644
--- a/llvm/test/CodeGen/X86/psubus.ll
+++ b/llvm/test/CodeGen/X86/psubus.ll
@@ -948,7 +948,7 @@ define <16 x i8> @test14(<16 x i8> %x, <16 x i32> %y) nounwind {
; AVX1-NEXT: vpxor %xmm7, %xmm3, %xmm3
; AVX1-NEXT: vpackssdw %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpacksswb %xmm5, %xmm3, %xmm3
-; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2
@@ -1746,7 +1746,8 @@ define <8 x i16> @psubus_8i64_max(<8 x i16> %x, <8 x i64> %y) nounwind {
; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm5
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [9223372036854841343,9223372036854841343]
; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5
-; AVX1-NEXT: vmovapd {{.*#+}} xmm7 = [65535,65535]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [65535,65535]
+; AVX1-NEXT: # xmm7 = mem[0,0]
; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm7, %xmm3
; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm5
; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5
@@ -2803,7 +2804,8 @@ define <8 x i32> @test33(<8 x i32> %a0, <8 x i64> %a1) {
; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm5
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [9223372041149743103,9223372041149743103]
; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5
-; AVX1-NEXT: vmovapd {{.*#+}} xmm7 = [4294967295,4294967295]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [4294967295,4294967295]
+; AVX1-NEXT: # xmm7 = mem[0,0]
; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm7, %xmm3
; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm5
; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5
@@ -3027,7 +3029,8 @@ define <8 x i32> @test34(<8 x i32> %a0, <8 x i64> %a1) {
; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm5
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [9223372041149743103,9223372041149743103]
; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5
-; AVX1-NEXT: vmovapd {{.*#+}} xmm7 = [4294967295,4294967295]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [4294967295,4294967295]
+; AVX1-NEXT: # xmm7 = mem[0,0]
; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm7, %xmm3
; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm5
; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5
diff --git a/llvm/test/CodeGen/X86/recip-fastmath.ll b/llvm/test/CodeGen/X86/recip-fastmath.ll
index 2279212ac8771..7e9bbc5556424 100644
--- a/llvm/test/CodeGen/X86/recip-fastmath.ll
+++ b/llvm/test/CodeGen/X86/recip-fastmath.ll
@@ -333,53 +333,11 @@ define <4 x float> @v4f32_no_estimate(<4 x float> %x) #0 {
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
-; AVX-RECIP-LABEL: v4f32_no_estimate:
-; AVX-RECIP: # %bb.0:
-; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; AVX-RECIP-NEXT: vdivps %xmm0, %xmm1, %xmm0
-; AVX-RECIP-NEXT: retq
-;
-; FMA-RECIP-LABEL: v4f32_no_estimate:
-; FMA-RECIP: # %bb.0:
-; FMA-RECIP-NEXT: vmovaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; FMA-RECIP-NEXT: vdivps %xmm0, %xmm1, %xmm0
-; FMA-RECIP-NEXT: retq
-;
-; BDVER2-LABEL: v4f32_no_estimate:
-; BDVER2: # %bb.0:
-; BDVER2-NEXT: vmovaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; BDVER2-NEXT: vdivps %xmm0, %xmm1, %xmm0
-; BDVER2-NEXT: retq
-;
-; BTVER2-LABEL: v4f32_no_estimate:
-; BTVER2: # %bb.0:
-; BTVER2-NEXT: vmovaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; BTVER2-NEXT: vdivps %xmm0, %xmm1, %xmm0
-; BTVER2-NEXT: retq
-;
-; SANDY-LABEL: v4f32_no_estimate:
-; SANDY: # %bb.0:
-; SANDY-NEXT: vmovaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; SANDY-NEXT: vdivps %xmm0, %xmm1, %xmm0
-; SANDY-NEXT: retq
-;
-; HASWELL-LABEL: v4f32_no_estimate:
-; HASWELL: # %bb.0:
-; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; HASWELL-NEXT: vdivps %xmm0, %xmm1, %xmm0
-; HASWELL-NEXT: retq
-;
-; HASWELL-NO-FMA-LABEL: v4f32_no_estimate:
-; HASWELL-NO-FMA: # %bb.0:
-; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; HASWELL-NO-FMA-NEXT: vdivps %xmm0, %xmm1, %xmm0
-; HASWELL-NO-FMA-NEXT: retq
-;
-; AVX512-LABEL: v4f32_no_estimate:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; AVX512-NEXT: vdivps %xmm0, %xmm1, %xmm0
-; AVX512-NEXT: retq
+; AVX-LABEL: v4f32_no_estimate:
+; AVX: # %bb.0:
+; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX-NEXT: vdivps %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retq
%div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
ret <4 x float> %div
}
@@ -400,7 +358,7 @@ define <4 x float> @v4f32_one_step(<4 x float> %x) #1 {
; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpps %xmm0, %xmm1
; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm0
-; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX-RECIP-NEXT: vsubps %xmm0, %xmm2, %xmm0
; AVX-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT: vaddps %xmm0, %xmm1, %xmm0
@@ -422,7 +380,7 @@ define <4 x float> @v4f32_one_step(<4 x float> %x) #1 {
;
; BTVER2-LABEL: v4f32_one_step:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BTVER2-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT: vrcpps %xmm0, %xmm1
; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0
; BTVER2-NEXT: vsubps %xmm0, %xmm2, %xmm0
@@ -434,7 +392,7 @@ define <4 x float> @v4f32_one_step(<4 x float> %x) #1 {
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpps %xmm0, %xmm1
; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0
-; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SANDY-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SANDY-NEXT: vsubps %xmm0, %xmm2, %xmm0
; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0
; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0
@@ -585,7 +543,7 @@ define <4 x float> @v4f32_two_step(<4 x float> %x) #2 {
; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpps %xmm0, %xmm1
; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm2
-; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX-RECIP-NEXT: vsubps %xmm2, %xmm3, %xmm2
; AVX-RECIP-NEXT: vmulps %xmm2, %xmm1, %xmm2
; AVX-RECIP-NEXT: vaddps %xmm2, %xmm1, %xmm1
@@ -598,7 +556,7 @@ define <4 x float> @v4f32_two_step(<4 x float> %x) #2 {
; FMA-RECIP-LABEL: v4f32_two_step:
; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpps %xmm0, %xmm1
-; FMA-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; FMA-RECIP-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA-RECIP-NEXT: vmovaps %xmm1, %xmm3
; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} xmm3 = (xmm0 * xmm3) - xmm2
; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} xmm3 = -(xmm3 * xmm1) + xmm1
@@ -609,7 +567,7 @@ define <4 x float> @v4f32_two_step(<4 x float> %x) #2 {
; BDVER2-LABEL: v4f32_two_step:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vrcpps %xmm0, %xmm1
-; BDVER2-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BDVER2-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BDVER2-NEXT: vfmsubps {{.*#+}} xmm3 = (xmm0 * xmm1) - xmm2
; BDVER2-NEXT: vfnmaddps {{.*#+}} xmm1 = -(xmm1 * xmm3) + xmm1
; BDVER2-NEXT: vfmsubps {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm2
@@ -618,7 +576,7 @@ define <4 x float> @v4f32_two_step(<4 x float> %x) #2 {
;
; BTVER2-LABEL: v4f32_two_step:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BTVER2-NEXT: vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT: vrcpps %xmm0, %xmm1
; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm2
; BTVER2-NEXT: vsubps %xmm2, %xmm3, %xmm2
@@ -634,7 +592,7 @@ define <4 x float> @v4f32_two_step(<4 x float> %x) #2 {
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpps %xmm0, %xmm1
; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm2
-; SANDY-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SANDY-NEXT: vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SANDY-NEXT: vsubps %xmm2, %xmm3, %xmm2
; SANDY-NEXT: vmulps %xmm2, %xmm1, %xmm2
; SANDY-NEXT: vaddps %xmm2, %xmm1, %xmm1
@@ -694,53 +652,11 @@ define <8 x float> @v8f32_no_estimate(<8 x float> %x) #0 {
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: retq
;
-; AVX-RECIP-LABEL: v8f32_no_estimate:
-; AVX-RECIP: # %bb.0:
-; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; AVX-RECIP-NEXT: vdivps %ymm0, %ymm1, %ymm0
-; AVX-RECIP-NEXT: retq
-;
-; FMA-RECIP-LABEL: v8f32_no_estimate:
-; FMA-RECIP: # %bb.0:
-; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; FMA-RECIP-NEXT: vdivps %ymm0, %ymm1, %ymm0
-; FMA-RECIP-NEXT: retq
-;
-; BDVER2-LABEL: v8f32_no_estimate:
-; BDVER2: # %bb.0:
-; BDVER2-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; BDVER2-NEXT: vdivps %ymm0, %ymm1, %ymm0
-; BDVER2-NEXT: retq
-;
-; BTVER2-LABEL: v8f32_no_estimate:
-; BTVER2: # %bb.0:
-; BTVER2-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; BTVER2-NEXT: vdivps %ymm0, %ymm1, %ymm0
-; BTVER2-NEXT: retq
-;
-; SANDY-LABEL: v8f32_no_estimate:
-; SANDY: # %bb.0:
-; SANDY-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; SANDY-NEXT: vdivps %ymm0, %ymm1, %ymm0
-; SANDY-NEXT: retq
-;
-; HASWELL-LABEL: v8f32_no_estimate:
-; HASWELL: # %bb.0:
-; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; HASWELL-NEXT: vdivps %ymm0, %ymm1, %ymm0
-; HASWELL-NEXT: retq
-;
-; HASWELL-NO-FMA-LABEL: v8f32_no_estimate:
-; HASWELL-NO-FMA: # %bb.0:
-; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; HASWELL-NO-FMA-NEXT: vdivps %ymm0, %ymm1, %ymm0
-; HASWELL-NO-FMA-NEXT: retq
-;
-; AVX512-LABEL: v8f32_no_estimate:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; AVX512-NEXT: vdivps %ymm0, %ymm1, %ymm0
-; AVX512-NEXT: retq
+; AVX-LABEL: v8f32_no_estimate:
+; AVX: # %bb.0:
+; AVX-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX-NEXT: vdivps %ymm0, %ymm1, %ymm0
+; AVX-NEXT: retq
%div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
ret <8 x float> %div
}
@@ -768,7 +684,7 @@ define <8 x float> @v8f32_one_step(<8 x float> %x) #1 {
; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm1
; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm0
-; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX-RECIP-NEXT: vsubps %ymm0, %ymm2, %ymm0
; AVX-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0
; AVX-RECIP-NEXT: vaddps %ymm0, %ymm1, %ymm0
@@ -790,7 +706,7 @@ define <8 x float> @v8f32_one_step(<8 x float> %x) #1 {
;
; BTVER2-LABEL: v8f32_one_step:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BTVER2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT: vrcpps %ymm0, %ymm1
; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0
; BTVER2-NEXT: vsubps %ymm0, %ymm2, %ymm0
@@ -802,7 +718,7 @@ define <8 x float> @v8f32_one_step(<8 x float> %x) #1 {
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpps %ymm0, %ymm1
; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0
-; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SANDY-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0
; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0
; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0
@@ -879,7 +795,7 @@ define <8 x float> @v8f32_two_step(<8 x float> %x) #2 {
; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm1
; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm2
-; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX-RECIP-NEXT: vsubps %ymm2, %ymm3, %ymm2
; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm2
; AVX-RECIP-NEXT: vaddps %ymm2, %ymm1, %ymm1
@@ -892,7 +808,7 @@ define <8 x float> @v8f32_two_step(<8 x float> %x) #2 {
; FMA-RECIP-LABEL: v8f32_two_step:
; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm1
-; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; FMA-RECIP-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA-RECIP-NEXT: vmovaps %ymm1, %ymm3
; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm3 = (ymm0 * ymm3) - ymm2
; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm3 = -(ymm3 * ymm1) + ymm1
@@ -903,7 +819,7 @@ define <8 x float> @v8f32_two_step(<8 x float> %x) #2 {
; BDVER2-LABEL: v8f32_two_step:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vrcpps %ymm0, %ymm1
-; BDVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BDVER2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BDVER2-NEXT: vfmsubps {{.*#+}} ymm3 = (ymm0 * ymm1) - ymm2
; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm1 = -(ymm1 * ymm3) + ymm1
; BDVER2-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm1) - ymm2
@@ -912,7 +828,7 @@ define <8 x float> @v8f32_two_step(<8 x float> %x) #2 {
;
; BTVER2-LABEL: v8f32_two_step:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BTVER2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT: vrcpps %ymm0, %ymm1
; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm2
; BTVER2-NEXT: vsubps %ymm2, %ymm3, %ymm2
@@ -928,7 +844,7 @@ define <8 x float> @v8f32_two_step(<8 x float> %x) #2 {
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpps %ymm0, %ymm1
; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm2
-; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SANDY-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SANDY-NEXT: vsubps %ymm2, %ymm3, %ymm2
; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm2
; SANDY-NEXT: vaddps %ymm2, %ymm1, %ymm1
@@ -996,35 +912,35 @@ define <16 x float> @v16f32_no_estimate(<16 x float> %x) #0 {
;
; AVX-RECIP-LABEL: v16f32_no_estimate:
; AVX-RECIP: # %bb.0:
-; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX-RECIP-NEXT: vdivps %ymm0, %ymm2, %ymm0
; AVX-RECIP-NEXT: vdivps %ymm1, %ymm2, %ymm1
; AVX-RECIP-NEXT: retq
;
; FMA-RECIP-LABEL: v16f32_no_estimate:
; FMA-RECIP: # %bb.0:
-; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; FMA-RECIP-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA-RECIP-NEXT: vdivps %ymm0, %ymm2, %ymm0
; FMA-RECIP-NEXT: vdivps %ymm1, %ymm2, %ymm1
; FMA-RECIP-NEXT: retq
;
; BDVER2-LABEL: v16f32_no_estimate:
; BDVER2: # %bb.0:
-; BDVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BDVER2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BDVER2-NEXT: vdivps %ymm0, %ymm2, %ymm0
; BDVER2-NEXT: vdivps %ymm1, %ymm2, %ymm1
; BDVER2-NEXT: retq
;
; BTVER2-LABEL: v16f32_no_estimate:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BTVER2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT: vdivps %ymm0, %ymm2, %ymm0
; BTVER2-NEXT: vdivps %ymm1, %ymm2, %ymm1
; BTVER2-NEXT: retq
;
; SANDY-LABEL: v16f32_no_estimate:
; SANDY: # %bb.0:
-; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SANDY-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SANDY-NEXT: vdivps %ymm0, %ymm2, %ymm0
; SANDY-NEXT: vdivps %ymm1, %ymm2, %ymm1
; SANDY-NEXT: retq
@@ -1089,7 +1005,7 @@ define <16 x float> @v16f32_one_step(<16 x float> %x) #1 {
; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm2
; AVX-RECIP-NEXT: vmulps %ymm2, %ymm0, %ymm0
-; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX-RECIP-NEXT: vsubps %ymm0, %ymm3, %ymm0
; AVX-RECIP-NEXT: vmulps %ymm0, %ymm2, %ymm0
; AVX-RECIP-NEXT: vaddps %ymm0, %ymm2, %ymm0
@@ -1103,7 +1019,7 @@ define <16 x float> @v16f32_one_step(<16 x float> %x) #1 {
; FMA-RECIP-LABEL: v16f32_one_step:
; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm2
-; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; FMA-RECIP-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm2 * ymm0) - ymm3
; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm2) + ymm2
; FMA-RECIP-NEXT: vrcpps %ymm1, %ymm2
@@ -1114,7 +1030,7 @@ define <16 x float> @v16f32_one_step(<16 x float> %x) #1 {
; BDVER2-LABEL: v16f32_one_step:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vrcpps %ymm0, %ymm2
-; BDVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BDVER2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BDVER2-NEXT: vrcpps %ymm1, %ymm4
; BDVER2-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm2) - ymm3
; BDVER2-NEXT: vfmsubps {{.*#+}} ymm1 = (ymm1 * ymm4) - ymm3
@@ -1124,7 +1040,7 @@ define <16 x float> @v16f32_one_step(<16 x float> %x) #1 {
;
; BTVER2-LABEL: v16f32_one_step:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BTVER2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT: vrcpps %ymm0, %ymm2
; BTVER2-NEXT: vrcpps %ymm1, %ymm4
; BTVER2-NEXT: vmulps %ymm2, %ymm0, %ymm0
@@ -1141,7 +1057,7 @@ define <16 x float> @v16f32_one_step(<16 x float> %x) #1 {
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpps %ymm0, %ymm2
; SANDY-NEXT: vmulps %ymm2, %ymm0, %ymm0
-; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SANDY-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SANDY-NEXT: vsubps %ymm0, %ymm3, %ymm0
; SANDY-NEXT: vrcpps %ymm1, %ymm4
; SANDY-NEXT: vmulps %ymm0, %ymm2, %ymm0
@@ -1249,7 +1165,7 @@ define <16 x float> @v16f32_two_step(<16 x float> %x) #2 {
; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm2
; AVX-RECIP-NEXT: vmulps %ymm2, %ymm0, %ymm3
-; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX-RECIP-NEXT: vsubps %ymm3, %ymm4, %ymm3
; AVX-RECIP-NEXT: vmulps %ymm3, %ymm2, %ymm3
; AVX-RECIP-NEXT: vaddps %ymm3, %ymm2, %ymm2
@@ -1271,7 +1187,7 @@ define <16 x float> @v16f32_two_step(<16 x float> %x) #2 {
; FMA-RECIP-LABEL: v16f32_two_step:
; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm2
-; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; FMA-RECIP-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA-RECIP-NEXT: vmovaps %ymm2, %ymm4
; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm4 = (ymm0 * ymm4) - ymm3
; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm4 = -(ymm4 * ymm2) + ymm2
@@ -1288,7 +1204,7 @@ define <16 x float> @v16f32_two_step(<16 x float> %x) #2 {
; BDVER2-LABEL: v16f32_two_step:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vrcpps %ymm0, %ymm2
-; BDVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BDVER2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BDVER2-NEXT: vfmsubps {{.*#+}} ymm4 = (ymm0 * ymm2) - ymm3
; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm2 = -(ymm2 * ymm4) + ymm2
; BDVER2-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm2) - ymm3
@@ -1302,7 +1218,7 @@ define <16 x float> @v16f32_two_step(<16 x float> %x) #2 {
;
; BTVER2-LABEL: v16f32_two_step:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BTVER2-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT: vrcpps %ymm0, %ymm2
; BTVER2-NEXT: vmulps %ymm2, %ymm0, %ymm3
; BTVER2-NEXT: vsubps %ymm3, %ymm4, %ymm3
@@ -1327,7 +1243,7 @@ define <16 x float> @v16f32_two_step(<16 x float> %x) #2 {
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpps %ymm0, %ymm2
; SANDY-NEXT: vmulps %ymm2, %ymm0, %ymm3
-; SANDY-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SANDY-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SANDY-NEXT: vsubps %ymm3, %ymm4, %ymm3
; SANDY-NEXT: vmulps %ymm3, %ymm2, %ymm3
; SANDY-NEXT: vaddps %ymm3, %ymm2, %ymm2
diff --git a/llvm/test/CodeGen/X86/recip-fastmath2.ll b/llvm/test/CodeGen/X86/recip-fastmath2.ll
index 5bb08e6ce2846..2a5e46bba2c00 100644
--- a/llvm/test/CodeGen/X86/recip-fastmath2.ll
+++ b/llvm/test/CodeGen/X86/recip-fastmath2.ll
@@ -476,7 +476,7 @@ define <4 x float> @v4f32_one_step_2_divs(<4 x float> %x) #1 {
; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpps %xmm0, %xmm1
; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm0
-; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX-RECIP-NEXT: vsubps %xmm0, %xmm2, %xmm0
; AVX-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT: vaddps %xmm0, %xmm1, %xmm0
@@ -504,7 +504,7 @@ define <4 x float> @v4f32_one_step_2_divs(<4 x float> %x) #1 {
;
; BTVER2-LABEL: v4f32_one_step_2_divs:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BTVER2-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT: vrcpps %xmm0, %xmm1
; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0
; BTVER2-NEXT: vsubps %xmm0, %xmm2, %xmm0
@@ -518,7 +518,7 @@ define <4 x float> @v4f32_one_step_2_divs(<4 x float> %x) #1 {
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpps %xmm0, %xmm1
; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0
-; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SANDY-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SANDY-NEXT: vsubps %xmm0, %xmm2, %xmm0
; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0
; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0
@@ -595,7 +595,7 @@ define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 {
; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpps %xmm0, %xmm1
; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm2
-; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX-RECIP-NEXT: vsubps %xmm2, %xmm3, %xmm2
; AVX-RECIP-NEXT: vmulps %xmm2, %xmm1, %xmm2
; AVX-RECIP-NEXT: vaddps %xmm2, %xmm1, %xmm1
@@ -610,7 +610,7 @@ define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 {
; FMA-RECIP-LABEL: v4f32_two_step2:
; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpps %xmm0, %xmm1
-; FMA-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; FMA-RECIP-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA-RECIP-NEXT: vfmsub231ps {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2
; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} xmm2 = -(xmm2 * xmm1) + xmm1
; FMA-RECIP-NEXT: vmovaps {{.*#+}} xmm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
@@ -632,7 +632,7 @@ define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 {
;
; BTVER2-LABEL: v4f32_two_step2:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BTVER2-NEXT: vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT: vrcpps %xmm0, %xmm1
; BTVER2-NEXT: vmovaps {{.*#+}} xmm4 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm2
@@ -650,7 +650,7 @@ define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 {
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpps %xmm0, %xmm1
; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm2
-; SANDY-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SANDY-NEXT: vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SANDY-NEXT: vsubps %xmm2, %xmm3, %xmm2
; SANDY-NEXT: vmulps %xmm2, %xmm1, %xmm2
; SANDY-NEXT: vaddps %xmm2, %xmm1, %xmm1
@@ -838,7 +838,7 @@ define <8 x float> @v8f32_one_step_2_divs(<8 x float> %x) #1 {
; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm1
; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm0
-; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX-RECIP-NEXT: vsubps %ymm0, %ymm2, %ymm0
; AVX-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0
; AVX-RECIP-NEXT: vaddps %ymm0, %ymm1, %ymm0
@@ -866,7 +866,7 @@ define <8 x float> @v8f32_one_step_2_divs(<8 x float> %x) #1 {
;
; BTVER2-LABEL: v8f32_one_step_2_divs:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BTVER2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT: vrcpps %ymm0, %ymm1
; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0
; BTVER2-NEXT: vsubps %ymm0, %ymm2, %ymm0
@@ -880,7 +880,7 @@ define <8 x float> @v8f32_one_step_2_divs(<8 x float> %x) #1 {
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpps %ymm0, %ymm1
; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0
-; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SANDY-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0
; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0
; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0
@@ -972,7 +972,7 @@ define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 {
; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm1
; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm2
-; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX-RECIP-NEXT: vsubps %ymm2, %ymm3, %ymm2
; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm2
; AVX-RECIP-NEXT: vaddps %ymm2, %ymm1, %ymm1
@@ -987,7 +987,7 @@ define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 {
; FMA-RECIP-LABEL: v8f32_two_step2:
; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm1
-; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; FMA-RECIP-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA-RECIP-NEXT: vfmsub231ps {{.*#+}} ymm2 = (ymm0 * ymm1) - ymm2
; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm2 = -(ymm2 * ymm1) + ymm1
; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
@@ -1009,7 +1009,7 @@ define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 {
;
; BTVER2-LABEL: v8f32_two_step2:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BTVER2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT: vrcpps %ymm0, %ymm1
; BTVER2-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm2
@@ -1027,7 +1027,7 @@ define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 {
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpps %ymm0, %ymm1
; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm2
-; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SANDY-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SANDY-NEXT: vsubps %ymm2, %ymm3, %ymm2
; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm2
; SANDY-NEXT: vaddps %ymm2, %ymm1, %ymm1
@@ -1327,7 +1327,7 @@ define <16 x float> @v16f32_one_step_2_divs(<16 x float> %x) #1 {
; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm2
; AVX-RECIP-NEXT: vmulps %ymm2, %ymm0, %ymm0
-; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX-RECIP-NEXT: vsubps %ymm0, %ymm3, %ymm0
; AVX-RECIP-NEXT: vmulps %ymm0, %ymm2, %ymm0
; AVX-RECIP-NEXT: vaddps %ymm0, %ymm2, %ymm0
@@ -1345,7 +1345,7 @@ define <16 x float> @v16f32_one_step_2_divs(<16 x float> %x) #1 {
; FMA-RECIP-LABEL: v16f32_one_step_2_divs:
; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm2
-; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; FMA-RECIP-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm2 * ymm0) - ymm3
; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm2) + ymm2
; FMA-RECIP-NEXT: vrcpps %ymm1, %ymm2
@@ -1360,7 +1360,7 @@ define <16 x float> @v16f32_one_step_2_divs(<16 x float> %x) #1 {
; BDVER2-LABEL: v16f32_one_step_2_divs:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vrcpps %ymm0, %ymm2
-; BDVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BDVER2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BDVER2-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm2) - ymm3
; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm2
; BDVER2-NEXT: vrcpps %ymm1, %ymm2
@@ -1374,7 +1374,7 @@ define <16 x float> @v16f32_one_step_2_divs(<16 x float> %x) #1 {
;
; BTVER2-LABEL: v16f32_one_step_2_divs:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BTVER2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT: vrcpps %ymm0, %ymm2
; BTVER2-NEXT: vmulps %ymm2, %ymm0, %ymm0
; BTVER2-NEXT: vsubps %ymm0, %ymm3, %ymm0
@@ -1395,7 +1395,7 @@ define <16 x float> @v16f32_one_step_2_divs(<16 x float> %x) #1 {
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpps %ymm0, %ymm2
; SANDY-NEXT: vmulps %ymm2, %ymm0, %ymm0
-; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SANDY-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SANDY-NEXT: vsubps %ymm0, %ymm3, %ymm0
; SANDY-NEXT: vmulps %ymm0, %ymm2, %ymm0
; SANDY-NEXT: vaddps %ymm0, %ymm2, %ymm0
@@ -1526,7 +1526,7 @@ define <16 x float> @v16f32_two_step2(<16 x float> %x) #2 {
; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm2
; AVX-RECIP-NEXT: vmulps %ymm2, %ymm0, %ymm3
-; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX-RECIP-NEXT: vsubps %ymm3, %ymm4, %ymm3
; AVX-RECIP-NEXT: vmulps %ymm3, %ymm2, %ymm3
; AVX-RECIP-NEXT: vaddps %ymm3, %ymm2, %ymm2
@@ -1552,7 +1552,7 @@ define <16 x float> @v16f32_two_step2(<16 x float> %x) #2 {
; FMA-RECIP-LABEL: v16f32_two_step2:
; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm2
-; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; FMA-RECIP-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA-RECIP-NEXT: vmovaps %ymm2, %ymm4
; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm4 = (ymm0 * ymm4) - ymm3
; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm4 = -(ymm4 * ymm2) + ymm2
@@ -1572,7 +1572,7 @@ define <16 x float> @v16f32_two_step2(<16 x float> %x) #2 {
; BDVER2-LABEL: v16f32_two_step2:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vrcpps %ymm0, %ymm2
-; BDVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BDVER2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BDVER2-NEXT: vfmsubps {{.*#+}} ymm4 = (ymm0 * ymm2) - ymm3
; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm2 = -(ymm2 * ymm4) + ymm2
; BDVER2-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
@@ -1590,7 +1590,7 @@ define <16 x float> @v16f32_two_step2(<16 x float> %x) #2 {
;
; BTVER2-LABEL: v16f32_two_step2:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BTVER2-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT: vrcpps %ymm0, %ymm2
; BTVER2-NEXT: vmulps %ymm2, %ymm0, %ymm3
; BTVER2-NEXT: vsubps %ymm3, %ymm4, %ymm3
@@ -1619,7 +1619,7 @@ define <16 x float> @v16f32_two_step2(<16 x float> %x) #2 {
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpps %ymm0, %ymm2
; SANDY-NEXT: vmulps %ymm2, %ymm0, %ymm3
-; SANDY-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SANDY-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SANDY-NEXT: vsubps %ymm3, %ymm4, %ymm3
; SANDY-NEXT: vmulps %ymm3, %ymm2, %ymm3
; SANDY-NEXT: vaddps %ymm3, %ymm2, %ymm2
diff --git a/llvm/test/CodeGen/X86/sadd_sat_vec.ll b/llvm/test/CodeGen/X86/sadd_sat_vec.ll
index 34eaec95e5ac3..cb89a6595ad3b 100644
--- a/llvm/test/CodeGen/X86/sadd_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/sadd_sat_vec.ll
@@ -1063,7 +1063,7 @@ define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind {
; AVX1-NEXT: vpsrad $31, %xmm6, %xmm2
; AVX1-NEXT: vpsrad $31, %xmm4, %xmm4
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
-; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm4 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
; AVX1-NEXT: vxorps %ymm4, %ymm2, %ymm2
; AVX1-NEXT: vblendvps %ymm0, %ymm2, %ymm7, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
@@ -1197,7 +1197,8 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
; AVX1-LABEL: v2i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vmovapd {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm3 = mem[0,0]
; AVX1-NEXT: vblendvpd %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm0, %xmm1, %xmm0
@@ -1207,7 +1208,8 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
; AVX2-LABEL: v2i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm2
-; AVX2-NEXT: vmovapd {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: # xmm3 = mem[0,0]
; AVX2-NEXT: vblendvpd %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
; AVX2-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm0, %xmm1, %xmm0
@@ -1217,7 +1219,8 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
; AVX512F-LABEL: v2i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpaddq %xmm1, %xmm0, %xmm2
-; AVX512F-NEXT: vmovapd {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX512F-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX512F-NEXT: # xmm3 = mem[0,0]
; AVX512F-NEXT: vblendvpd %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
; AVX512F-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpxor %xmm0, %xmm1, %xmm0
@@ -1733,7 +1736,7 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm4
; AVX1-NEXT: vpcmpgtq %xmm6, %xmm2, %xmm5
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4
-; AVX1-NEXT: vmovapd {{.*#+}} ymm5 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm5 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; AVX1-NEXT: vxorpd %ymm5, %ymm4, %ymm4
; AVX1-NEXT: vblendvpd %ymm0, %ymm4, %ymm7, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
diff --git a/llvm/test/CodeGen/X86/sat-add.ll b/llvm/test/CodeGen/X86/sat-add.ll
index b421fa2408039..48a3155cea341 100644
--- a/llvm/test/CodeGen/X86/sat-add.ll
+++ b/llvm/test/CodeGen/X86/sat-add.ll
@@ -656,7 +656,8 @@ define <2 x i64> @unsigned_sat_constant_v2i64_using_min(<2 x i64> %x) {
;
; AVX2-LABEL: unsigned_sat_constant_v2i64_using_min:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovapd {{.*#+}} xmm1 = [18446744073709551573,18446744073709551573]
+; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = [18446744073709551573,18446744073709551573]
+; AVX2-NEXT: # xmm1 = mem[0,0]
; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775765,9223372036854775765]
; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll
index f2508cd22f2d4..b042ce13bd627 100644
--- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll
+++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll
@@ -315,7 +315,8 @@ define void @trunc_v4i64_to_v4i32(ptr %L, ptr %S) nounwind {
;
; AVX2-FAST-ALL-LABEL: trunc_v4i64_to_v4i32:
; AVX2-FAST-ALL: # %bb.0:
-; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm0 = <0,2,4,6,u,u,u,u>
+; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,2,4,6,0,2,4,6]
+; AVX2-FAST-ALL-NEXT: # ymm0 = mem[0,1,0,1]
; AVX2-FAST-ALL-NEXT: vpermps (%rdi), %ymm0, %ymm0
; AVX2-FAST-ALL-NEXT: vmovaps %xmm0, (%rsi)
; AVX2-FAST-ALL-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/splat-const.ll b/llvm/test/CodeGen/X86/splat-const.ll
index b2b27347f234e..b9fd29658367f 100644
--- a/llvm/test/CodeGen/X86/splat-const.ll
+++ b/llvm/test/CodeGen/X86/splat-const.ll
@@ -38,7 +38,7 @@ define <4 x i32> @const_vector() {
;
; AVX-LABEL: const_vector:
; AVX: # %bb.0:
-; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [42,42,42,42]
+; AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [42,42,42,42]
; AVX-NEXT: retq
;
; AVX2-LABEL: const_vector:
diff --git a/llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll b/llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll
index 2a0b9285f3249..6d6a7b897c332 100644
--- a/llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll
+++ b/llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll
@@ -64,7 +64,7 @@ define <4 x float> @v4f32_no_daz(<4 x float> %f) #0 {
; SNB-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; SNB-NEXT: vmulps %xmm1, %xmm3, %xmm1
; SNB-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; SNB-NEXT: vmovaps {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
+; SNB-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
; SNB-NEXT: vcmpleps %xmm0, %xmm2, %xmm0
; SNB-NEXT: vandps %xmm1, %xmm0, %xmm0
; SNB-NEXT: retq
@@ -152,7 +152,7 @@ define <8 x float> @v8f32_no_daz(<8 x float> %f) #0 {
; SNB-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; SNB-NEXT: vmulps %ymm1, %ymm3, %ymm1
; SNB-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; SNB-NEXT: vmovaps {{.*#+}} ymm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
+; SNB-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
; SNB-NEXT: vcmpleps %ymm0, %ymm2, %ymm0
; SNB-NEXT: vandps %ymm1, %ymm0, %ymm0
; SNB-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/sqrt-fastmath.ll b/llvm/test/CodeGen/X86/sqrt-fastmath.ll
index 54ea207ac5dc6..1c1df175bdb6f 100644
--- a/llvm/test/CodeGen/X86/sqrt-fastmath.ll
+++ b/llvm/test/CodeGen/X86/sqrt-fastmath.ll
@@ -210,7 +210,7 @@ define <4 x float> @sqrt_v4f32_check_denorms_ieee_ninf(<4 x float> %x) #3 {
; AVX1-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vmulps %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vmovaps {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
; AVX1-NEXT: vcmpleps %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
@@ -261,7 +261,7 @@ define <4 x float> @sqrt_v4f32_check_denorms_dynamic_ninf(<4 x float> %x) #6 {
; AVX1-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vmulps %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vmovaps {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
; AVX1-NEXT: vcmpleps %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
@@ -360,19 +360,12 @@ define <4 x float> @v4f32_no_estimate(<4 x float> %x) #0 {
; SSE-NEXT: divps %xmm1, %xmm0
; SSE-NEXT: retq
;
-; AVX1-LABEL: v4f32_no_estimate:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vsqrtps %xmm0, %xmm0
-; AVX1-NEXT: vmovaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; AVX1-NEXT: vdivps %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX512-LABEL: v4f32_no_estimate:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vsqrtps %xmm0, %xmm0
-; AVX512-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; AVX512-NEXT: vdivps %xmm0, %xmm1, %xmm0
-; AVX512-NEXT: retq
+; AVX-LABEL: v4f32_no_estimate:
+; AVX: # %bb.0:
+; AVX-NEXT: vsqrtps %xmm0, %xmm0
+; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX-NEXT: vdivps %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retq
%sqrt = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x)
%div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
ret <4 x float> %div
@@ -431,7 +424,7 @@ define <4 x float> @v4f32_estimate2(<4 x float> %x) #5 {
; AVX1-NEXT: vrsqrtps %xmm0, %xmm1
; AVX1-NEXT: vmulps %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vmovaps {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
; AVX1-NEXT: vcmpleps %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
@@ -461,19 +454,12 @@ define <8 x float> @v8f32_no_estimate(<8 x float> %x) #0 {
; SSE-NEXT: divps %xmm2, %xmm1
; SSE-NEXT: retq
;
-; AVX1-LABEL: v8f32_no_estimate:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vsqrtps %ymm0, %ymm0
-; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; AVX1-NEXT: vdivps %ymm0, %ymm1, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX512-LABEL: v8f32_no_estimate:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vsqrtps %ymm0, %ymm0
-; AVX512-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; AVX512-NEXT: vdivps %ymm0, %ymm1, %ymm0
-; AVX512-NEXT: retq
+; AVX-LABEL: v8f32_no_estimate:
+; AVX: # %bb.0:
+; AVX-NEXT: vsqrtps %ymm0, %ymm0
+; AVX-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX-NEXT: vdivps %ymm0, %ymm1, %ymm0
+; AVX-NEXT: retq
%sqrt = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %x)
%div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
ret <8 x float> %div
@@ -544,7 +530,7 @@ define <16 x float> @v16f32_no_estimate(<16 x float> %x) #0 {
; AVX1: # %bb.0:
; AVX1-NEXT: vsqrtps %ymm1, %ymm1
; AVX1-NEXT: vsqrtps %ymm0, %ymm0
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX1-NEXT: vdivps %ymm0, %ymm2, %ymm0
; AVX1-NEXT: vdivps %ymm1, %ymm2, %ymm1
; AVX1-NEXT: retq
@@ -595,11 +581,11 @@ define <16 x float> @v16f32_estimate(<16 x float> %x) #1 {
; AVX1-LABEL: v16f32_estimate:
; AVX1: # %bb.0:
; AVX1-NEXT: vrsqrtps %ymm0, %ymm2
-; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; AVX1-NEXT: vmulps %ymm3, %ymm2, %ymm4
; AVX1-NEXT: vmulps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vmulps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; AVX1-NEXT: vaddps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vrsqrtps %ymm1, %ymm5
; AVX1-NEXT: vmulps %ymm0, %ymm4, %ymm0
@@ -985,7 +971,8 @@ define <2 x double> @sqrt_simplify_before_recip_vec(<2 x double> %x, ptr %p) nou
; AVX-LABEL: sqrt_simplify_before_recip_vec:
; AVX: # %bb.0:
; AVX-NEXT: vsqrtpd %xmm0, %xmm0
-; AVX-NEXT: vmovapd {{.*#+}} xmm1 = [1.0E+0,1.0E+0]
+; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [1.0E+0,1.0E+0]
+; AVX-NEXT: # xmm1 = mem[0,0]
; AVX-NEXT: vdivpd %xmm0, %xmm1, %xmm1
; AVX-NEXT: vmovupd %xmm1, (%rdi)
; AVX-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll
index 95eb23fc3cd5d..b042f122541b0 100644
--- a/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll
+++ b/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll
@@ -521,20 +521,10 @@ define <4 x i32> @test_srem_one_eq(<4 x i32> %X) nounwind {
; CHECK-SSE-NEXT: movaps {{.*#+}} xmm0 = [1,1,1,1]
; CHECK-SSE-NEXT: retq
;
-; CHECK-AVX1-LABEL: test_srem_one_eq:
-; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,1,1]
-; CHECK-AVX1-NEXT: retq
-;
-; CHECK-AVX2-LABEL: test_srem_one_eq:
-; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1]
-; CHECK-AVX2-NEXT: retq
-;
-; CHECK-AVX512VL-LABEL: test_srem_one_eq:
-; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1]
-; CHECK-AVX512VL-NEXT: retq
+; CHECK-AVX-LABEL: test_srem_one_eq:
+; CHECK-AVX: # %bb.0:
+; CHECK-AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1]
+; CHECK-AVX-NEXT: retq
%srem = srem <4 x i32> %X, <i32 1, i32 1, i32 1, i32 1>
%cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
%ret = zext <4 x i1> %cmp to <4 x i32>
@@ -677,20 +667,10 @@ define <4 x i32> @test_srem_allones(<4 x i32> %X) nounwind {
; CHECK-SSE-NEXT: movaps {{.*#+}} xmm0 = [1,1,1,1]
; CHECK-SSE-NEXT: retq
;
-; CHECK-AVX1-LABEL: test_srem_allones:
-; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,1,1]
-; CHECK-AVX1-NEXT: retq
-;
-; CHECK-AVX2-LABEL: test_srem_allones:
-; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1]
-; CHECK-AVX2-NEXT: retq
-;
-; CHECK-AVX512VL-LABEL: test_srem_allones:
-; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1]
-; CHECK-AVX512VL-NEXT: retq
+; CHECK-AVX-LABEL: test_srem_allones:
+; CHECK-AVX: # %bb.0:
+; CHECK-AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1]
+; CHECK-AVX-NEXT: retq
%srem = srem <4 x i32> %X, <i32 4294967295, i32 4294967295, i32 4294967295, i32 4294967295>
%cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
%ret = zext <4 x i1> %cmp to <4 x i32>
diff --git a/llvm/test/CodeGen/X86/sse2.ll b/llvm/test/CodeGen/X86/sse2.ll
index a651648a9d727..231f274db83cd 100644
--- a/llvm/test/CodeGen/X86/sse2.ll
+++ b/llvm/test/CodeGen/X86/sse2.ll
@@ -601,17 +601,11 @@ define fastcc void @test17() nounwind {
; X86-SSE-NEXT: movaps %xmm0, (%eax)
; X86-SSE-NEXT: retl
;
-; X86-AVX1-LABEL: test17:
-; X86-AVX1: # %bb.0: # %entry
-; X86-AVX1-NEXT: vmovaps {{.*#+}} xmm0 = <u,u,32768,32768>
-; X86-AVX1-NEXT: vmovaps %xmm0, (%eax)
-; X86-AVX1-NEXT: retl
-;
-; X86-AVX512-LABEL: test17:
-; X86-AVX512: # %bb.0: # %entry
-; X86-AVX512-NEXT: vbroadcastss {{.*#+}} xmm0 = [32768,32768,32768,32768]
-; X86-AVX512-NEXT: vmovaps %xmm0, (%eax)
-; X86-AVX512-NEXT: retl
+; X86-AVX-LABEL: test17:
+; X86-AVX: # %bb.0: # %entry
+; X86-AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [32768,32768,32768,32768]
+; X86-AVX-NEXT: vmovaps %xmm0, (%eax)
+; X86-AVX-NEXT: retl
;
; X64-SSE-LABEL: test17:
; X64-SSE: # %bb.0: # %entry
@@ -619,17 +613,11 @@ define fastcc void @test17() nounwind {
; X64-SSE-NEXT: movaps %xmm0, (%rax)
; X64-SSE-NEXT: retq
;
-; X64-AVX1-LABEL: test17:
-; X64-AVX1: # %bb.0: # %entry
-; X64-AVX1-NEXT: vmovaps {{.*#+}} xmm0 = <u,u,32768,32768>
-; X64-AVX1-NEXT: vmovaps %xmm0, (%rax)
-; X64-AVX1-NEXT: retq
-;
-; X64-AVX512-LABEL: test17:
-; X64-AVX512: # %bb.0: # %entry
-; X64-AVX512-NEXT: vbroadcastss {{.*#+}} xmm0 = [32768,32768,32768,32768]
-; X64-AVX512-NEXT: vmovaps %xmm0, (%rax)
-; X64-AVX512-NEXT: retq
+; X64-AVX-LABEL: test17:
+; X64-AVX: # %bb.0: # %entry
+; X64-AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [32768,32768,32768,32768]
+; X64-AVX-NEXT: vmovaps %xmm0, (%rax)
+; X64-AVX-NEXT: retq
entry:
%0 = insertelement <4 x i32> undef, i32 undef, i32 1
%1 = shufflevector <4 x i32> <i32 undef, i32 undef, i32 32768, i32 32768>, <4 x i32> %0, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
@@ -712,3 +700,8 @@ define <4 x i32> @test_mul(<4 x i32> %x, <4 x i32> %y) {
%m = mul <4 x i32> %x, %y
ret <4 x i32> %m
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; X64-AVX1: {{.*}}
+; X64-AVX512: {{.*}}
+; X86-AVX1: {{.*}}
+; X86-AVX512: {{.*}}
diff --git a/llvm/test/CodeGen/X86/sshl_sat_vec.ll b/llvm/test/CodeGen/X86/sshl_sat_vec.ll
index bd9ee00d32e70..72a3e74ff0a7f 100644
--- a/llvm/test/CodeGen/X86/sshl_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/sshl_sat_vec.ll
@@ -48,8 +48,10 @@ define <2 x i64> @vec_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
;
; X64-AVX2-LABEL: vec_v2i64:
; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: vmovapd {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; X64-AVX2-NEXT: vmovapd {{.*#+}} xmm3 = [9223372036854775807,9223372036854775807]
+; X64-AVX2-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; X64-AVX2-NEXT: # xmm2 = mem[0,0]
+; X64-AVX2-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775807,9223372036854775807]
+; X64-AVX2-NEXT: # xmm3 = mem[0,0]
; X64-AVX2-NEXT: vblendvpd %xmm0, %xmm2, %xmm3, %xmm3
; X64-AVX2-NEXT: vpsrlvq %xmm1, %xmm2, %xmm2
; X64-AVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm4
diff --git a/llvm/test/CodeGen/X86/ssub_sat_vec.ll b/llvm/test/CodeGen/X86/ssub_sat_vec.ll
index c8fd7e89c605f..21f1fd6c8da21 100644
--- a/llvm/test/CodeGen/X86/ssub_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/ssub_sat_vec.ll
@@ -1126,7 +1126,7 @@ define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind {
; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2
; AVX1-NEXT: vpsrad $31, %xmm4, %xmm4
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
-; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm4 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
; AVX1-NEXT: vxorps %ymm4, %ymm2, %ymm2
; AVX1-NEXT: vblendvps %ymm0, %ymm2, %ymm6, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
@@ -1292,7 +1292,8 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vmovapd {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vblendvpd %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
; AVX1-NEXT: retq
@@ -1304,7 +1305,8 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm0, %xmm2, %xmm0
-; AVX2-NEXT: vmovapd {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: # xmm2 = mem[0,0]
; AVX2-NEXT: vblendvpd %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX2-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
; AVX2-NEXT: retq
@@ -1316,7 +1318,8 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
; AVX512F-NEXT: vpsubq %xmm1, %xmm0, %xmm1
; AVX512F-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vpxor %xmm0, %xmm2, %xmm0
-; AVX512F-NEXT: vmovapd {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX512F-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX512F-NEXT: # xmm2 = mem[0,0]
; AVX512F-NEXT: vblendvpd %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX512F-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
; AVX512F-NEXT: retq
@@ -1959,7 +1962,7 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
; AVX1-NEXT: vpcmpgtq %xmm5, %xmm4, %xmm5
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2
-; AVX1-NEXT: vmovapd {{.*#+}} ymm5 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm5 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; AVX1-NEXT: vxorpd %ymm5, %ymm2, %ymm2
; AVX1-NEXT: vblendvpd %ymm0, %ymm2, %ymm6, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll
index 8f770f98bc5ce..0b9a413d00b1d 100644
--- a/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll
+++ b/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll
@@ -443,20 +443,10 @@ define <4 x i32> @test_urem_one_eq(<4 x i32> %X) nounwind {
; CHECK-SSE-NEXT: movaps {{.*#+}} xmm0 = [1,1,1,1]
; CHECK-SSE-NEXT: retq
;
-; CHECK-AVX1-LABEL: test_urem_one_eq:
-; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,1,1]
-; CHECK-AVX1-NEXT: retq
-;
-; CHECK-AVX2-LABEL: test_urem_one_eq:
-; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1]
-; CHECK-AVX2-NEXT: retq
-;
-; CHECK-AVX512VL-LABEL: test_urem_one_eq:
-; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1]
-; CHECK-AVX512VL-NEXT: retq
+; CHECK-AVX-LABEL: test_urem_one_eq:
+; CHECK-AVX: # %bb.0:
+; CHECK-AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1]
+; CHECK-AVX-NEXT: retq
%urem = urem <4 x i32> %X, <i32 1, i32 1, i32 1, i32 1>
%cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
%ret = zext <4 x i1> %cmp to <4 x i32>
diff --git a/llvm/test/CodeGen/X86/v8i1-masks.ll b/llvm/test/CodeGen/X86/v8i1-masks.ll
index 212d9764622de..c053acd17a1fd 100644
--- a/llvm/test/CodeGen/X86/v8i1-masks.ll
+++ b/llvm/test/CodeGen/X86/v8i1-masks.ll
@@ -240,7 +240,7 @@ define <8 x i32> @and_mask_constant(<8 x i32> %v0, <8 x i32> %v1) {
define <8 x i32> @two_ands(<8 x float> %x) local_unnamed_addr #0 {
; X86-LABEL: two_ands:
; X86: ## %bb.0: ## %entry
-; X86-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
; X86-NEXT: vandps %ymm0, %ymm1, %ymm0
@@ -248,7 +248,7 @@ define <8 x i32> @two_ands(<8 x float> %x) local_unnamed_addr #0 {
;
; X64-LABEL: two_ands:
; X64: ## %bb.0: ## %entry
-; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; X64-NEXT: vandps %ymm0, %ymm1, %ymm0
@@ -298,7 +298,7 @@ entry:
define <8 x i32> @three_ands(<8 x float> %x) {
; X86-LABEL: three_ands:
; X86: ## %bb.0: ## %entry
-; X86-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2
; X86-NEXT: vxorps %xmm3, %xmm3, %xmm3
@@ -309,7 +309,7 @@ define <8 x i32> @three_ands(<8 x float> %x) {
;
; X64-LABEL: three_ands:
; X64: ## %bb.0: ## %entry
-; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3
@@ -374,7 +374,7 @@ entry:
define <8 x i32> @four_ands(<8 x float> %x) {
; X86-LABEL: four_ands:
; X86: ## %bb.0: ## %entry
-; X86-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2
; X86-NEXT: vxorps %xmm3, %xmm3, %xmm3
@@ -387,7 +387,7 @@ define <8 x i32> @four_ands(<8 x float> %x) {
;
; X64-LABEL: four_ands:
; X64: ## %bb.0: ## %entry
-; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3
@@ -464,7 +464,7 @@ entry:
define <8 x i32> @five_ands(<8 x float> %x) {
; X86-LABEL: five_ands:
; X86: ## %bb.0: ## %entry
-; X86-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2
; X86-NEXT: vxorps %xmm3, %xmm3, %xmm3
@@ -479,7 +479,7 @@ define <8 x i32> @five_ands(<8 x float> %x) {
;
; X64-LABEL: five_ands:
; X64: ## %bb.0: ## %entry
-; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3
@@ -568,7 +568,7 @@ entry:
define <8 x i32> @two_or(<8 x float> %x) {
; X86-LABEL: two_or:
; X86: ## %bb.0: ## %entry
-; X86-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
; X86-NEXT: vorps %ymm0, %ymm1, %ymm0
@@ -576,7 +576,7 @@ define <8 x i32> @two_or(<8 x float> %x) {
;
; X64-LABEL: two_or:
; X64: ## %bb.0: ## %entry
-; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; X64-NEXT: vorps %ymm0, %ymm1, %ymm0
@@ -628,7 +628,7 @@ entry:
define <8 x i32> @three_or(<8 x float> %x) {
; X86-LABEL: three_or:
; X86: ## %bb.0: ## %entry
-; X86-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2
; X86-NEXT: vxorps %xmm3, %xmm3, %xmm3
@@ -639,7 +639,7 @@ define <8 x i32> @three_or(<8 x float> %x) {
;
; X64-LABEL: three_or:
; X64: ## %bb.0: ## %entry
-; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3
@@ -708,7 +708,7 @@ entry:
define <8 x i32> @four_or(<8 x float> %x) {
; X86-LABEL: four_or:
; X86: ## %bb.0: ## %entry
-; X86-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2
; X86-NEXT: vxorps %xmm3, %xmm3, %xmm3
@@ -721,7 +721,7 @@ define <8 x i32> @four_or(<8 x float> %x) {
;
; X64-LABEL: four_or:
; X64: ## %bb.0: ## %entry
-; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3
@@ -804,7 +804,7 @@ entry:
define <8 x i32> @five_or(<8 x float> %x) {
; X86-LABEL: five_or:
; X86: ## %bb.0: ## %entry
-; X86-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2
; X86-NEXT: vxorps %xmm3, %xmm3, %xmm3
@@ -819,7 +819,7 @@ define <8 x i32> @five_or(<8 x float> %x) {
;
; X64-LABEL: five_or:
; X64: ## %bb.0: ## %entry
-; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3
@@ -916,7 +916,7 @@ entry:
define <8 x i32> @three_or_and(<8 x float> %x) {
; X86-LABEL: three_or_and:
; X86: ## %bb.0: ## %entry
-; X86-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2
; X86-NEXT: vxorps %xmm3, %xmm3, %xmm3
@@ -927,7 +927,7 @@ define <8 x i32> @three_or_and(<8 x float> %x) {
;
; X64-LABEL: three_or_and:
; X64: ## %bb.0: ## %entry
-; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3
@@ -994,7 +994,7 @@ entry:
define <8 x i32> @four_or_and(<8 x float> %x) {
; X86-LABEL: four_or_and:
; X86: ## %bb.0: ## %entry
-; X86-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2
; X86-NEXT: vandps %ymm2, %ymm1, %ymm1
@@ -1007,7 +1007,7 @@ define <8 x i32> @four_or_and(<8 x float> %x) {
;
; X64-LABEL: four_or_and:
; X64: ## %bb.0: ## %entry
-; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
; X64-NEXT: vandps %ymm2, %ymm1, %ymm1
@@ -1086,7 +1086,7 @@ entry:
define <8 x i32> @five_or_and(<8 x float> %x) {
; X86-LABEL: five_or_and:
; X86: ## %bb.0: ## %entry
-; X86-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2
; X86-NEXT: vxorps %xmm3, %xmm3, %xmm3
@@ -1101,7 +1101,7 @@ define <8 x i32> @five_or_and(<8 x float> %x) {
;
; X64-LABEL: five_or_and:
; X64: ## %bb.0: ## %entry
-; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3
@@ -1194,7 +1194,7 @@ entry:
define <8 x i32> @four_or_and_xor(<8 x float> %x) {
; X86-LABEL: four_or_and_xor:
; X86: ## %bb.0: ## %entry
-; X86-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2
; X86-NEXT: vxorps %ymm2, %ymm1, %ymm1
@@ -1207,7 +1207,7 @@ define <8 x i32> @four_or_and_xor(<8 x float> %x) {
;
; X64-LABEL: four_or_and_xor:
; X64: ## %bb.0: ## %entry
-; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
; X64-NEXT: vxorps %ymm2, %ymm1, %ymm1
@@ -1288,7 +1288,7 @@ entry:
define <8 x i32> @five_or_and_xor(<8 x float> %x) {
; X86-LABEL: five_or_and_xor:
; X86: ## %bb.0: ## %entry
-; X86-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2
; X86-NEXT: vxorps %xmm3, %xmm3, %xmm3
@@ -1303,7 +1303,7 @@ define <8 x i32> @five_or_and_xor(<8 x float> %x) {
;
; X64-LABEL: five_or_and_xor:
; X64: ## %bb.0: ## %entry
-; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3
@@ -1397,7 +1397,7 @@ entry:
define <8 x i32> @six_or_and_xor(<8 x float> %x) {
; X86-LABEL: six_or_and_xor:
; X86: ## %bb.0: ## %entry
-; X86-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2
; X86-NEXT: vxorps %xmm3, %xmm3, %xmm3
@@ -1414,7 +1414,7 @@ define <8 x i32> @six_or_and_xor(<8 x float> %x) {
;
; X64-LABEL: six_or_and_xor:
; X64: ## %bb.0: ## %entry
-; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3
diff --git a/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll b/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll
index a11f13e606c30..349d94d930651 100644
--- a/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll
@@ -3063,10 +3063,10 @@ define <4 x i32> @strict_vector_fptoui_v4f32_to_v4i32(<4 x float> %a) #0 {
;
; AVX-LABEL: strict_vector_fptoui_v4f32_to_v4i32:
; AVX: # %bb.0:
-; AVX-NEXT: vmovaps {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
+; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
; AVX-NEXT: vcmpltps %xmm1, %xmm0, %xmm2
; AVX-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; AVX-NEXT: vmovaps {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; AVX-NEXT: vbroadcastss {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
; AVX-NEXT: vblendvps %xmm2, %xmm3, %xmm4, %xmm4
; AVX-NEXT: vblendvps %xmm2, %xmm3, %xmm1, %xmm1
; AVX-NEXT: vsubps %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll b/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll
index d072072414f42..b28211bb4388f 100644
--- a/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll
@@ -1113,12 +1113,12 @@ define <4 x i32> @strict_vector_fptosi_v4f64_to_v4i32(<4 x double> %a) #0 {
define <4 x i32> @strict_vector_fptoui_v4f64_to_v4i32(<4 x double> %a) #0 {
; AVX-LABEL: strict_vector_fptoui_v4f64_to_v4i32:
; AVX: # %bb.0:
-; AVX-NEXT: vmovapd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9]
+; AVX-NEXT: vbroadcastsd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9]
; AVX-NEXT: vcmpltpd %ymm1, %ymm0, %ymm2
; AVX-NEXT: vextractf128 $1, %ymm2, %xmm3
; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm3[0,2]
; AVX-NEXT: vxorps %xmm4, %xmm4, %xmm4
-; AVX-NEXT: vmovaps {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
+; AVX-NEXT: vbroadcastss {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
; AVX-NEXT: vblendvps %xmm3, %xmm4, %xmm5, %xmm3
; AVX-NEXT: vxorps %xmm4, %xmm4, %xmm4
; AVX-NEXT: vblendvpd %ymm2, %ymm4, %ymm1, %ymm1
@@ -1379,10 +1379,10 @@ define <8 x i32> @strict_vector_fptosi_v8f32_to_v8i32(<8 x float> %a) #0 {
define <8 x i32> @strict_vector_fptoui_v8f32_to_v8i32(<8 x float> %a) #0 {
; AVX-LABEL: strict_vector_fptoui_v8f32_to_v8i32:
; AVX: # %bb.0:
-; AVX-NEXT: vmovaps {{.*#+}} ymm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
+; AVX-NEXT: vbroadcastss {{.*#+}} ymm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
; AVX-NEXT: vcmpltps %ymm1, %ymm0, %ymm2
; AVX-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; AVX-NEXT: vmovaps {{.*#+}} ymm4 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
+; AVX-NEXT: vbroadcastss {{.*#+}} ymm4 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
; AVX-NEXT: vblendvps %ymm2, %ymm3, %ymm4, %ymm4
; AVX-NEXT: vblendvps %ymm2, %ymm3, %ymm1, %ymm1
; AVX-NEXT: vsubps %ymm1, %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/vec_anyext.ll b/llvm/test/CodeGen/X86/vec_anyext.ll
index d5a08299a00a6..edba0caabc15f 100644
--- a/llvm/test/CodeGen/X86/vec_anyext.ll
+++ b/llvm/test/CodeGen/X86/vec_anyext.ll
@@ -211,7 +211,8 @@ define <4 x i8> @func_8_64(ptr %a, ptr %b) nounwind {
define <4 x i16> @const_16_32() nounwind {
; CHECK-LABEL: const_16_32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = <0,3,8,7,u,u,u,u>
+; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = [0,3,8,7,0,3,8,7]
+; CHECK-NEXT: # xmm0 = mem[0,0]
; CHECK-NEXT: ret{{[l|q]}}
%G = trunc <4 x i32> <i32 0, i32 3, i32 8, i32 7> to <4 x i16>
ret <4 x i16> %G
@@ -220,7 +221,8 @@ define <4 x i16> @const_16_32() nounwind {
define <4 x i16> @const_16_64() nounwind {
; CHECK-LABEL: const_16_64:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = <0,3,8,7,u,u,u,u>
+; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = [0,3,8,7,0,3,8,7]
+; CHECK-NEXT: # xmm0 = mem[0,0]
; CHECK-NEXT: ret{{[l|q]}}
%G = trunc <4 x i64> <i64 0, i64 3, i64 8, i64 7> to <4 x i16>
ret <4 x i16> %G
diff --git a/llvm/test/CodeGen/X86/vec_fabs.ll b/llvm/test/CodeGen/X86/vec_fabs.ll
index fb01a18ea9280..982062d890754 100644
--- a/llvm/test/CodeGen/X86/vec_fabs.ll
+++ b/llvm/test/CodeGen/X86/vec_fabs.ll
@@ -149,7 +149,7 @@ declare <8 x float> @llvm.fabs.v8f32(<8 x float> %p)
define <8 x double> @fabs_v8f64(<8 x double> %p) {
; X86-AVX-LABEL: fabs_v8f64:
; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: vmovaps {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN]
+; X86-AVX-NEXT: vbroadcastsd {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN]
; X86-AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
; X86-AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
; X86-AVX-NEXT: retl
@@ -166,7 +166,7 @@ define <8 x double> @fabs_v8f64(<8 x double> %p) {
;
; X64-AVX-LABEL: fabs_v8f64:
; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vmovaps {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN]
+; X64-AVX-NEXT: vbroadcastsd {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN]
; X64-AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
; X64-AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
; X64-AVX-NEXT: retq
@@ -188,7 +188,7 @@ declare <8 x double> @llvm.fabs.v8f64(<8 x double> %p)
define <16 x float> @fabs_v16f32(<16 x float> %p) {
; X86-AVX-LABEL: fabs_v16f32:
; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: vmovaps {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
+; X86-AVX-NEXT: vbroadcastss {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
; X86-AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
; X86-AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
; X86-AVX-NEXT: retl
@@ -205,7 +205,7 @@ define <16 x float> @fabs_v16f32(<16 x float> %p) {
;
; X64-AVX-LABEL: fabs_v16f32:
; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vmovaps {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
+; X64-AVX-NEXT: vbroadcastss {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
; X64-AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
; X64-AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
; X64-AVX-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vec_fp_to_int.ll b/llvm/test/CodeGen/X86/vec_fp_to_int.ll
index 04609f02d333e..4f7a4676390f8 100644
--- a/llvm/test/CodeGen/X86/vec_fp_to_int.ll
+++ b/llvm/test/CodeGen/X86/vec_fp_to_int.ll
@@ -1912,7 +1912,8 @@ define <4 x i32> @fptosi_2f64_to_2i32_const() {
;
; AVX-LABEL: fptosi_2f64_to_2i32_const:
; AVX: # %bb.0:
-; AVX-NEXT: vmovaps {{.*#+}} xmm0 = <4294967295,1,u,u>
+; AVX-NEXT: vmovddup {{.*#+}} xmm0 = [4294967295,1,4294967295,1]
+; AVX-NEXT: # xmm0 = mem[0,0]
; AVX-NEXT: retq
%cvt = fptosi <2 x double> <double -1.0, double 1.0> to <2 x i32>
%ext = shufflevector <2 x i32> %cvt, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
@@ -1970,7 +1971,8 @@ define <4 x i32> @fptoui_2f64_to_2i32_const(<2 x double> %a) {
;
; AVX-LABEL: fptoui_2f64_to_2i32_const:
; AVX: # %bb.0:
-; AVX-NEXT: vmovaps {{.*#+}} xmm0 = <2,4,u,u>
+; AVX-NEXT: vmovddup {{.*#+}} xmm0 = [2,4,2,4]
+; AVX-NEXT: # xmm0 = mem[0,0]
; AVX-NEXT: retq
%cvt = fptoui <2 x double> <double 2.0, double 4.0> to <2 x i32>
%ext = shufflevector <2 x i32> %cvt, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp.ll b/llvm/test/CodeGen/X86/vec_int_to_fp.ll
index 54133eef2ef20..24e05bd937b0c 100644
--- a/llvm/test/CodeGen/X86/vec_int_to_fp.ll
+++ b/llvm/test/CodeGen/X86/vec_int_to_fp.ll
@@ -4786,7 +4786,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(ptr%a) {
; AVX1: # %bb.0:
; AVX1-NEXT: vmovapd (%rdi), %ymm2
; AVX1-NEXT: vmovapd 32(%rdi), %ymm3
-; AVX1-NEXT: vmovapd {{.*#+}} ymm4 = [1,1,1,1]
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [1,1,1,1]
; AVX1-NEXT: vandpd %ymm4, %ymm3, %ymm5
; AVX1-NEXT: vmovaps (%rdi), %xmm0
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
@@ -5640,7 +5640,8 @@ define void @PR43609(ptr nocapture %x, <2 x i64> %y) #0 {
; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072]
; AVX1-NEXT: vpor %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vmovapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25]
+; AVX1-NEXT: # xmm6 = mem[0,0]
; AVX1-NEXT: vsubpd %xmm6, %xmm0, %xmm0
; AVX1-NEXT: vaddpd %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
@@ -5649,7 +5650,8 @@ define void @PR43609(ptr nocapture %x, <2 x i64> %y) #0 {
; AVX1-NEXT: vpor %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vsubpd %xmm6, %xmm1, %xmm1
; AVX1-NEXT: vaddpd %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vmovapd {{.*#+}} xmm2 = [5.0E-1,5.0E-1]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [5.0E-1,5.0E-1]
+; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vaddpd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vaddpd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vmovupd %xmm0, (%rdi)
@@ -5666,7 +5668,8 @@ define void @PR43609(ptr nocapture %x, <2 x i64> %y) #0 {
; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0
; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072]
; AVX2-NEXT: vpor %xmm5, %xmm0, %xmm0
-; AVX2-NEXT: vmovapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25]
+; AVX2-NEXT: vmovddup {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25]
+; AVX2-NEXT: # xmm6 = mem[0,0]
; AVX2-NEXT: vsubpd %xmm6, %xmm0, %xmm0
; AVX2-NEXT: vaddpd %xmm0, %xmm3, %xmm0
; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
@@ -5675,7 +5678,8 @@ define void @PR43609(ptr nocapture %x, <2 x i64> %y) #0 {
; AVX2-NEXT: vpor %xmm5, %xmm1, %xmm1
; AVX2-NEXT: vsubpd %xmm6, %xmm1, %xmm1
; AVX2-NEXT: vaddpd %xmm1, %xmm2, %xmm1
-; AVX2-NEXT: vmovapd {{.*#+}} xmm2 = [5.0E-1,5.0E-1]
+; AVX2-NEXT: vmovddup {{.*#+}} xmm2 = [5.0E-1,5.0E-1]
+; AVX2-NEXT: # xmm2 = mem[0,0]
; AVX2-NEXT: vaddpd %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vaddpd %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vmovupd %xmm0, (%rdi)
@@ -5692,7 +5696,8 @@ define void @PR43609(ptr nocapture %x, <2 x i64> %y) #0 {
; AVX512F-NEXT: vpsrlq $32, %xmm0, %xmm0
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072]
; AVX512F-NEXT: vpor %xmm5, %xmm0, %xmm0
-; AVX512F-NEXT: vmovapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25]
+; AVX512F-NEXT: vmovddup {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25]
+; AVX512F-NEXT: # xmm6 = mem[0,0]
; AVX512F-NEXT: vsubpd %xmm6, %xmm0, %xmm0
; AVX512F-NEXT: vaddpd %xmm0, %xmm3, %xmm0
; AVX512F-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
@@ -5701,7 +5706,8 @@ define void @PR43609(ptr nocapture %x, <2 x i64> %y) #0 {
; AVX512F-NEXT: vpor %xmm5, %xmm1, %xmm1
; AVX512F-NEXT: vsubpd %xmm6, %xmm1, %xmm1
; AVX512F-NEXT: vaddpd %xmm1, %xmm2, %xmm1
-; AVX512F-NEXT: vmovapd {{.*#+}} xmm2 = [5.0E-1,5.0E-1]
+; AVX512F-NEXT: vmovddup {{.*#+}} xmm2 = [5.0E-1,5.0E-1]
+; AVX512F-NEXT: # xmm2 = mem[0,0]
; AVX512F-NEXT: vaddpd %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vaddpd %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vmovupd %xmm0, (%rdi)
@@ -5742,7 +5748,8 @@ define void @PR43609(ptr nocapture %x, <2 x i64> %y) #0 {
; AVX512DQ-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX512DQ-NEXT: vcvtuqq2pd %zmm0, %zmm0
; AVX512DQ-NEXT: vcvtuqq2pd %zmm1, %zmm1
-; AVX512DQ-NEXT: vmovapd {{.*#+}} xmm2 = [5.0E-1,5.0E-1]
+; AVX512DQ-NEXT: vmovddup {{.*#+}} xmm2 = [5.0E-1,5.0E-1]
+; AVX512DQ-NEXT: # xmm2 = mem[0,0]
; AVX512DQ-NEXT: vaddpd %xmm2, %xmm0, %xmm0
; AVX512DQ-NEXT: vaddpd %xmm2, %xmm1, %xmm1
; AVX512DQ-NEXT: vmovupd %xmm0, (%rdi)
diff --git a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
index be58e3958dea7..9a43d312f1322 100644
--- a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
@@ -431,7 +431,8 @@ define <2 x double> @constrained_vector_fmul_v2f64() #0 {
;
; AVX-LABEL: constrained_vector_fmul_v2f64:
; AVX: # %bb.0: # %entry
-; AVX-NEXT: vmovapd {{.*#+}} xmm0 = [1.7976931348623157E+308,1.7976931348623157E+308]
+; AVX-NEXT: vmovddup {{.*#+}} xmm0 = [1.7976931348623157E+308,1.7976931348623157E+308]
+; AVX-NEXT: # xmm0 = mem[0,0]
; AVX-NEXT: vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
entry:
@@ -493,7 +494,8 @@ define <3 x double> @constrained_vector_fmul_v3f64() #0 {
; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vmovapd {{.*#+}} xmm1 = [1.7976931348623157E+308,1.7976931348623157E+308]
+; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [1.7976931348623157E+308,1.7976931348623157E+308]
+; AVX-NEXT: # xmm1 = mem[0,0]
; AVX-NEXT: vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX-NEXT: retq
@@ -516,17 +518,11 @@ define <4 x double> @constrained_vector_fmul_v4f64() #0 {
; CHECK-NEXT: mulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-NEXT: retq
;
-; AVX1-LABEL: constrained_vector_fmul_v4f64:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vmovapd {{.*#+}} ymm0 = [1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308]
-; AVX1-NEXT: vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX512-LABEL: constrained_vector_fmul_v4f64:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vbroadcastsd {{.*#+}} ymm0 = [1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308]
-; AVX512-NEXT: vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX512-NEXT: retq
+; AVX-LABEL: constrained_vector_fmul_v4f64:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vbroadcastsd {{.*#+}} ymm0 = [1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308]
+; AVX-NEXT: vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX-NEXT: retq
entry:
%mul = call <4 x double> @llvm.experimental.constrained.fmul.v4f64(
<4 x double> <double 0x7FEFFFFFFFFFFFFF, double 0x7FEFFFFFFFFFFFFF,
@@ -568,7 +564,8 @@ define <2 x double> @constrained_vector_fadd_v2f64() #0 {
;
; AVX-LABEL: constrained_vector_fadd_v2f64:
; AVX: # %bb.0: # %entry
-; AVX-NEXT: vmovapd {{.*#+}} xmm0 = [1.7976931348623157E+308,1.7976931348623157E+308]
+; AVX-NEXT: vmovddup {{.*#+}} xmm0 = [1.7976931348623157E+308,1.7976931348623157E+308]
+; AVX-NEXT: # xmm0 = mem[0,0]
; AVX-NEXT: vaddpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
entry:
@@ -631,7 +628,8 @@ define <3 x double> @constrained_vector_fadd_v3f64() #0 {
; AVX: # %bb.0: # %entry
; AVX-NEXT: vxorpd %xmm0, %xmm0, %xmm0
; AVX-NEXT: vaddsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vmovapd {{.*#+}} xmm1 = [1.7976931348623157E+308,1.7976931348623157E+308]
+; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [1.7976931348623157E+308,1.7976931348623157E+308]
+; AVX-NEXT: # xmm1 = mem[0,0]
; AVX-NEXT: vaddpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX-NEXT: retq
@@ -654,17 +652,11 @@ define <4 x double> @constrained_vector_fadd_v4f64() #0 {
; CHECK-NEXT: addpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-NEXT: retq
;
-; AVX1-LABEL: constrained_vector_fadd_v4f64:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vmovapd {{.*#+}} ymm0 = [1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308]
-; AVX1-NEXT: vaddpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX512-LABEL: constrained_vector_fadd_v4f64:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vbroadcastsd {{.*#+}} ymm0 = [1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308]
-; AVX512-NEXT: vaddpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX512-NEXT: retq
+; AVX-LABEL: constrained_vector_fadd_v4f64:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vbroadcastsd {{.*#+}} ymm0 = [1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308]
+; AVX-NEXT: vaddpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX-NEXT: retq
entry:
%add = call <4 x double> @llvm.experimental.constrained.fadd.v4f64(
<4 x double> <double 0x7FEFFFFFFFFFFFFF, double 0x7FEFFFFFFFFFFFFF,
@@ -706,7 +698,8 @@ define <2 x double> @constrained_vector_fsub_v2f64() #0 {
;
; AVX-LABEL: constrained_vector_fsub_v2f64:
; AVX: # %bb.0: # %entry
-; AVX-NEXT: vmovapd {{.*#+}} xmm0 = [-1.7976931348623157E+308,-1.7976931348623157E+308]
+; AVX-NEXT: vmovddup {{.*#+}} xmm0 = [-1.7976931348623157E+308,-1.7976931348623157E+308]
+; AVX-NEXT: # xmm0 = mem[0,0]
; AVX-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
entry:
@@ -772,7 +765,8 @@ define <3 x double> @constrained_vector_fsub_v3f64() #0 {
; AVX-NEXT: vxorpd %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: vsubsd %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vmovapd {{.*#+}} xmm1 = [-1.7976931348623157E+308,-1.7976931348623157E+308]
+; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [-1.7976931348623157E+308,-1.7976931348623157E+308]
+; AVX-NEXT: # xmm1 = mem[0,0]
; AVX-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX-NEXT: retq
@@ -795,17 +789,11 @@ define <4 x double> @constrained_vector_fsub_v4f64() #0 {
; CHECK-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-NEXT: retq
;
-; AVX1-LABEL: constrained_vector_fsub_v4f64:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vmovapd {{.*#+}} ymm0 = [-1.7976931348623157E+308,-1.7976931348623157E+308,-1.7976931348623157E+308,-1.7976931348623157E+308]
-; AVX1-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX512-LABEL: constrained_vector_fsub_v4f64:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vbroadcastsd {{.*#+}} ymm0 = [-1.7976931348623157E+308,-1.7976931348623157E+308,-1.7976931348623157E+308,-1.7976931348623157E+308]
-; AVX512-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX512-NEXT: retq
+; AVX-LABEL: constrained_vector_fsub_v4f64:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vbroadcastsd {{.*#+}} ymm0 = [-1.7976931348623157E+308,-1.7976931348623157E+308,-1.7976931348623157E+308,-1.7976931348623157E+308]
+; AVX-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX-NEXT: retq
entry:
%sub = call <4 x double> @llvm.experimental.constrained.fsub.v4f64(
<4 x double> <double 0xFFEFFFFFFFFFFFFF, double 0xFFEFFFFFFFFFFFFF,
@@ -4460,11 +4448,11 @@ define <4 x i32> @constrained_vector_fptoui_v4i32_v4f32() #0 {
;
; AVX1-LABEL: constrained_vector_fptoui_v4i32_v4f32:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm0 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
; AVX1-NEXT: vmovaps {{.*#+}} xmm1 = [4.2E+1,4.3E+1,4.4E+1,4.5E+1]
; AVX1-NEXT: vcmpltps %xmm0, %xmm1, %xmm2
; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vmovaps {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
; AVX1-NEXT: vblendvps %xmm2, %xmm3, %xmm4, %xmm4
; AVX1-NEXT: vblendvps %xmm2, %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vsubps %xmm0, %xmm1, %xmm0
@@ -5010,13 +4998,13 @@ define <4 x i32> @constrained_vector_fptoui_v4i32_v4f64() #0 {
;
; AVX1-LABEL: constrained_vector_fptoui_v4i32_v4f64:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vmovapd {{.*#+}} ymm0 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9]
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9]
; AVX1-NEXT: vmovapd {{.*#+}} ymm1 = [4.2100000000000001E+1,4.2200000000000003E+1,4.2299999999999997E+1,4.2399999999999999E+1]
; AVX1-NEXT: vcmpltpd %ymm0, %ymm1, %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm3[0,2]
; AVX1-NEXT: vxorps %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vmovaps {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
; AVX1-NEXT: vblendvps %xmm3, %xmm4, %xmm5, %xmm3
; AVX1-NEXT: vxorps %xmm4, %xmm4, %xmm4
; AVX1-NEXT: vblendvpd %ymm2, %ymm4, %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/vector-fshl-256.ll b/llvm/test/CodeGen/X86/vector-fshl-256.ll
index adae44774b182..0500d6ec6e1f9 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-256.ll
@@ -22,7 +22,7 @@ declare <32 x i8> @llvm.fshl.v32i8(<32 x i8>, <32 x i8>, <32 x i8>)
define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt) nounwind {
; AVX1-LABEL: var_funnnel_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [63,63,63,63]
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm3 = [63,63,63,63]
; AVX1-NEXT: vandnps %ymm3, %ymm2, %ymm4
; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6
@@ -123,7 +123,7 @@ define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt)
;
; XOPAVX1-LABEL: var_funnnel_v4i64:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vmovaps {{.*#+}} ymm3 = [63,63,63,63]
+; XOPAVX1-NEXT: vbroadcastsd {{.*#+}} ymm3 = [63,63,63,63]
; XOPAVX1-NEXT: vandps %ymm3, %ymm2, %ymm4
; XOPAVX1-NEXT: vextractf128 $1, %ymm4, %xmm5
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
diff --git a/llvm/test/CodeGen/X86/vector-fshr-256.ll b/llvm/test/CodeGen/X86/vector-fshr-256.ll
index 9b230ccefd3c8..4a580c8bacabe 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-256.ll
@@ -22,7 +22,7 @@ declare <32 x i8> @llvm.fshr.v32i8(<32 x i8>, <32 x i8>, <32 x i8>)
define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt) nounwind {
; AVX1-LABEL: var_funnnel_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [63,63,63,63]
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm3 = [63,63,63,63]
; AVX1-NEXT: vandps %ymm3, %ymm2, %ymm4
; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6
@@ -124,7 +124,7 @@ define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt)
;
; XOPAVX1-LABEL: var_funnnel_v4i64:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vmovaps {{.*#+}} ymm3 = [63,63,63,63]
+; XOPAVX1-NEXT: vbroadcastsd {{.*#+}} ymm3 = [63,63,63,63]
; XOPAVX1-NEXT: vandps %ymm3, %ymm2, %ymm4
; XOPAVX1-NEXT: vextractf128 $1, %ymm4, %xmm5
; XOPAVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll
index 3e65c31cf83a1..a6e64e1d8f6d0 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll
@@ -328,7 +328,8 @@ define void @load_i32_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm6 = <1,4,7,2,5,u,u,u>
; AVX2-FAST-NEXT: vpermps %ymm5, %ymm6, %ymm5
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7]
-; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm5 = <u,u,u,u,u,1,4,7>
+; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm5 = [0,1,4,7,0,1,4,7]
+; AVX2-FAST-NEXT: # ymm5 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpermps %ymm2, %ymm5, %ymm2
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u>
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll
index 8c7a91013144e..8b1aae61ed5c3 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll
@@ -177,7 +177,8 @@ define void @load_i32_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
;
; AVX2-ONLY-LABEL: load_i32_stride4_vf4:
; AVX2-ONLY: # %bb.0:
-; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm0 = <u,u,0,4>
+; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [0,4,0,4]
+; AVX2-ONLY-NEXT: # xmm0 = mem[0,0]
; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm1
; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm0, %ymm0
; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm2
@@ -187,16 +188,19 @@ define void @load_i32_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3]
; AVX2-ONLY-NEXT: vmovaps 48(%rdi), %xmm5
; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
-; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm7 = <1,5,u,u>
+; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [1,5,1,5]
+; AVX2-ONLY-NEXT: # xmm7 = mem[0,0]
; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm8
; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm7, %ymm7
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
-; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm7 = <u,u,2,6>
+; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [2,6,2,6]
+; AVX2-ONLY-NEXT: # xmm7 = mem[0,0]
; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm7, %ymm1
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm3 = <3,7,u,u>
+; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [3,7,3,7]
+; AVX2-ONLY-NEXT: # xmm3 = mem[0,0]
; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm3, %ymm3
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
; AVX2-ONLY-NEXT: vmovaps %xmm0, (%rsi)
@@ -336,7 +340,8 @@ define void @load_i32_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm3, %ymm5
; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm3, %ymm3
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
-; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm5 = <u,u,0,4>
+; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [0,4,0,4]
+; AVX2-ONLY-NEXT: # xmm5 = mem[0,0]
; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm5, %ymm6
; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm7
; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm8
@@ -350,7 +355,8 @@ define void @load_i32_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm9[6,7]
; AVX2-ONLY-NEXT: vmovaps 48(%rdi), %xmm9
; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm5[0],xmm9[0],xmm5[1],xmm9[1]
-; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm11 = <1,5,u,u>
+; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [1,5,1,5]
+; AVX2-ONLY-NEXT: # xmm11 = mem[0,0]
; AVX2-ONLY-NEXT: vpermps %ymm0, %ymm11, %ymm11
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7]
@@ -358,7 +364,8 @@ define void @load_i32_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm10, %ymm11
; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm10, %ymm10
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7]
-; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm11 = <u,u,2,6>
+; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [2,6,2,6]
+; AVX2-ONLY-NEXT: # xmm11 = mem[0,0]
; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm11, %ymm4
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm7 = xmm7[2],xmm8[2],xmm7[3],xmm8[3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1],xmm4[2,3]
@@ -368,7 +375,8 @@ define void @load_i32_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm7, %ymm1
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm5[2],xmm9[2],xmm5[3],xmm9[3]
-; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm5 = <3,7,u,u>
+; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [3,7,3,7]
+; AVX2-ONLY-NEXT: # xmm5 = mem[0,0]
; AVX2-ONLY-NEXT: vpermps %ymm0, %ymm5, %ymm0
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
@@ -673,7 +681,8 @@ define void @load_i32_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-ONLY-NEXT: vmovaps 144(%rdi), %xmm10
; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm11
; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
-; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm9 = <u,u,0,4>
+; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [0,4,0,4]
+; AVX2-ONLY-NEXT: # xmm9 = mem[0,0]
; AVX2-ONLY-NEXT: vpermps %ymm12, %ymm9, %ymm12
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm12[2,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7]
@@ -695,7 +704,8 @@ define void @load_i32_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm15
; AVX2-ONLY-NEXT: vmovaps 48(%rdi), %xmm6
; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm15[0],xmm6[0],xmm15[1],xmm6[1]
-; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm7 = <1,5,u,u>
+; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [1,5,1,5]
+; AVX2-ONLY-NEXT: # xmm7 = mem[0,0]
; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm7, %ymm14
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm14[0,1],xmm9[2,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm1[4,5,6,7]
@@ -716,7 +726,8 @@ define void @load_i32_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm7, %ymm9
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7]
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm11[2],xmm10[2],xmm11[3],xmm10[3]
-; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm10 = <u,u,2,6>
+; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = [2,6,2,6]
+; AVX2-ONLY-NEXT: # xmm10 = mem[0,0]
; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm11 # 32-byte Folded Reload
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],xmm11[2,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
@@ -732,7 +743,8 @@ define void @load_i32_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm9, %ymm4
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm15[2],xmm6[2],xmm15[3],xmm6[3]
-; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm6 = <3,7,u,u>
+; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [3,7,3,7]
+; AVX2-ONLY-NEXT: # xmm6 = mem[0,0]
; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm10 # 32-byte Folded Reload
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm10[0,1],xmm5[2,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
@@ -1379,7 +1391,8 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm2
; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm3 = <u,u,0,4>
+; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,4,0,4]
+; AVX2-ONLY-NEXT: # xmm3 = mem[0,0]
; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm3, %ymm4
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
@@ -1440,7 +1453,8 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm1
; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm6 = <1,5,u,u>
+; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [1,5,1,5]
+; AVX2-ONLY-NEXT: # xmm6 = mem[0,0]
; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm6, %ymm2
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
@@ -1494,7 +1508,8 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm13 # 16-byte Folded Reload
; AVX2-ONLY-NEXT: # xmm13 = xmm3[2],mem[2],xmm3[3],mem[3]
-; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm3 = <u,u,2,6>
+; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [2,6,2,6]
+; AVX2-ONLY-NEXT: # xmm3 = mem[0,0]
; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm10 # 32-byte Folded Reload
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm13[0,1],xmm10[2,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
@@ -1532,7 +1547,8 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm10, %ymm1
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm12[2],xmm5[2],xmm12[3],xmm5[3]
-; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm2 = <3,7,u,u>
+; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [3,7,3,7]
+; AVX2-ONLY-NEXT: # xmm2 = mem[0,0]
; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
@@ -2855,7 +2871,8 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm1
; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm7 = <u,u,0,4>
+; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [0,4,0,4]
+; AVX2-ONLY-NEXT: # xmm7 = mem[0,0]
; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm7, %ymm2
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
@@ -2999,7 +3016,8 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm1
; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1]
-; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm7 = <1,5,u,u>
+; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [1,5,1,5]
+; AVX2-ONLY-NEXT: # xmm7 = mem[0,0]
; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm7, %ymm10
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
@@ -3118,7 +3136,8 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX2-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
-; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm2 = <u,u,2,6>
+; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [2,6,2,6]
+; AVX2-ONLY-NEXT: # xmm2 = mem[0,0]
; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm7 # 32-byte Folded Reload
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7]
@@ -3206,7 +3225,8 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm7 # 16-byte Folded Reload
; AVX2-ONLY-NEXT: # xmm7 = xmm1[2],mem[2],xmm1[3],mem[3]
-; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm1 = <3,7,u,u>
+; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [3,7,3,7]
+; AVX2-ONLY-NEXT: # xmm1 = mem[0,0]
; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1],xmm7[2,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll
index 872fd8698cca1..d28215f89fdc1 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll
@@ -86,10 +86,12 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,0,2,3]
; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3]
-; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm3 = <4,2,u,u>
+; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [4,2,4,2]
+; AVX2-ONLY-NEXT: # xmm3 = mem[0,0]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX2-ONLY-NEXT: vpermps %ymm0, %ymm3, %ymm3
-; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm6 = <5,3,u,u>
+; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [5,3,5,3]
+; AVX2-ONLY-NEXT: # xmm6 = mem[0,0]
; AVX2-ONLY-NEXT: vpermps %ymm0, %ymm6, %ymm0
; AVX2-ONLY-NEXT: vmovlps %xmm4, (%rsi)
; AVX2-ONLY-NEXT: vmovlps %xmm2, (%rdx)
@@ -114,11 +116,13 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,0,2,3]
; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX512F-SLOW-NEXT: vmovaps {{.*#+}} xmm2 = <4,2,u,u>
+; AVX512F-SLOW-NEXT: vmovddup {{.*#+}} xmm2 = [4,2,4,2]
+; AVX512F-SLOW-NEXT: # xmm2 = mem[0,0]
; AVX512F-SLOW-NEXT: vmovaps 32(%rdi), %ymm5
; AVX512F-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],mem[4,5,6,7]
; AVX512F-SLOW-NEXT: vpermps %ymm5, %ymm2, %ymm2
-; AVX512F-SLOW-NEXT: vmovaps {{.*#+}} xmm6 = <5,3,u,u>
+; AVX512F-SLOW-NEXT: vmovddup {{.*#+}} xmm6 = [5,3,5,3]
+; AVX512F-SLOW-NEXT: # xmm6 = mem[0,0]
; AVX512F-SLOW-NEXT: vpermps %ymm5, %ymm6, %ymm5
; AVX512F-SLOW-NEXT: vmovq %xmm3, (%rsi)
; AVX512F-SLOW-NEXT: vmovq %xmm1, (%rdx)
@@ -143,11 +147,13 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-FAST-NEXT: vpermi2d %xmm3, %xmm1, %xmm2
; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <3,5,u,u>
; AVX512F-FAST-NEXT: vpermi2d %xmm3, %xmm1, %xmm5
-; AVX512F-FAST-NEXT: vmovaps {{.*#+}} xmm1 = <4,2,u,u>
+; AVX512F-FAST-NEXT: vmovddup {{.*#+}} xmm1 = [4,2,4,2]
+; AVX512F-FAST-NEXT: # xmm1 = mem[0,0]
; AVX512F-FAST-NEXT: vmovaps 32(%rdi), %ymm3
; AVX512F-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],mem[4,5,6,7]
; AVX512F-FAST-NEXT: vpermps %ymm3, %ymm1, %ymm1
-; AVX512F-FAST-NEXT: vmovaps {{.*#+}} xmm6 = <5,3,u,u>
+; AVX512F-FAST-NEXT: vmovddup {{.*#+}} xmm6 = [5,3,5,3]
+; AVX512F-FAST-NEXT: # xmm6 = mem[0,0]
; AVX512F-FAST-NEXT: vpermps %ymm3, %ymm6, %ymm3
; AVX512F-FAST-NEXT: vmovq %xmm0, (%rsi)
; AVX512F-FAST-NEXT: vmovq %xmm4, (%rdx)
@@ -172,11 +178,13 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,0,2,3]
; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX512BW-SLOW-NEXT: vmovaps {{.*#+}} xmm2 = <4,2,u,u>
+; AVX512BW-SLOW-NEXT: vmovddup {{.*#+}} xmm2 = [4,2,4,2]
+; AVX512BW-SLOW-NEXT: # xmm2 = mem[0,0]
; AVX512BW-SLOW-NEXT: vmovaps 32(%rdi), %ymm5
; AVX512BW-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],mem[4,5,6,7]
; AVX512BW-SLOW-NEXT: vpermps %ymm5, %ymm2, %ymm2
-; AVX512BW-SLOW-NEXT: vmovaps {{.*#+}} xmm6 = <5,3,u,u>
+; AVX512BW-SLOW-NEXT: vmovddup {{.*#+}} xmm6 = [5,3,5,3]
+; AVX512BW-SLOW-NEXT: # xmm6 = mem[0,0]
; AVX512BW-SLOW-NEXT: vpermps %ymm5, %ymm6, %ymm5
; AVX512BW-SLOW-NEXT: vmovq %xmm3, (%rsi)
; AVX512BW-SLOW-NEXT: vmovq %xmm1, (%rdx)
@@ -201,11 +209,13 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FAST-NEXT: vpermi2d %xmm3, %xmm1, %xmm2
; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <3,5,u,u>
; AVX512BW-FAST-NEXT: vpermi2d %xmm3, %xmm1, %xmm5
-; AVX512BW-FAST-NEXT: vmovaps {{.*#+}} xmm1 = <4,2,u,u>
+; AVX512BW-FAST-NEXT: vmovddup {{.*#+}} xmm1 = [4,2,4,2]
+; AVX512BW-FAST-NEXT: # xmm1 = mem[0,0]
; AVX512BW-FAST-NEXT: vmovaps 32(%rdi), %ymm3
; AVX512BW-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],mem[4,5,6,7]
; AVX512BW-FAST-NEXT: vpermps %ymm3, %ymm1, %ymm1
-; AVX512BW-FAST-NEXT: vmovaps {{.*#+}} xmm6 = <5,3,u,u>
+; AVX512BW-FAST-NEXT: vmovddup {{.*#+}} xmm6 = [5,3,5,3]
+; AVX512BW-FAST-NEXT: # xmm6 = mem[0,0]
; AVX512BW-FAST-NEXT: vpermps %ymm3, %ymm6, %ymm3
; AVX512BW-FAST-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-FAST-NEXT: vmovq %xmm4, (%rdx)
@@ -672,7 +682,8 @@ define void @load_i32_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm5[0,1],ymm10[2,3],ymm5[4,5,6,7]
; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,1,0,2,4,5,4,6]
-; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm11 = <4,2,u,u>
+; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm11 = [4,2,4,2]
+; AVX2-SLOW-NEXT: # xmm11 = mem[0,0]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm11, %ymm4
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2,3,4,5,6,7]
@@ -683,7 +694,8 @@ define void @load_i32_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7]
; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm5[1,1,1,1,5,5,5,5]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm10[3],ymm4[4,5,6,7]
-; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm5 = <5,3,u,u>
+; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm5 = [5,3,5,3]
+; AVX2-SLOW-NEXT: # xmm5 = mem[0,0]
; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm5, %ymm3
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,3,1,7,0,3,1,7]
@@ -727,7 +739,7 @@ define void @load_i32_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vpermps %ymm9, %ymm8, %ymm8
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm8[6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7]
-; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm9 = <u,u,u,4,2,u,u,u>
+; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm9 = [2,4,2,4,2,4,2,4]
; AVX2-FAST-NEXT: vpermps %ymm8, %ymm9, %ymm8
; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm9
; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm10 = xmm9[2,3,2,3]
@@ -757,7 +769,8 @@ define void @load_i32_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm5[0,1],ymm10[2,3],ymm5[4,5,6,7]
; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,1,0,2,4,5,4,6]
-; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm11 = <4,2,u,u>
+; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm11 = [4,2,4,2]
+; AVX2-FAST-NEXT: # xmm11 = mem[0,0]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
; AVX2-FAST-NEXT: vpermps %ymm3, %ymm11, %ymm4
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2,3,4,5,6,7]
@@ -768,7 +781,8 @@ define void @load_i32_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7]
; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm5[1,1,1,1,5,5,5,5]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm10[3],ymm4[4,5,6,7]
-; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm5 = <5,3,u,u>
+; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm5 = [5,3,5,3]
+; AVX2-FAST-NEXT: # xmm5 = mem[0,0]
; AVX2-FAST-NEXT: vpermps %ymm3, %ymm5, %ymm3
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7]
; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,3,1,7,0,3,1,7]
@@ -842,7 +856,8 @@ define void @load_i32_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm5[0,1],ymm10[2,3],ymm5[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,1,0,2,4,5,4,6]
-; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm11 = <4,2,u,u>
+; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm11 = [4,2,4,2]
+; AVX2-FAST-PERLANE-NEXT: # xmm11 = mem[0,0]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm11, %ymm4
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2,3,4,5,6,7]
@@ -853,7 +868,8 @@ define void @load_i32_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm5[1,1,1,1,5,5,5,5]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm10[3],ymm4[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm5 = <5,3,u,u>
+; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm5 = [5,3,5,3]
+; AVX2-FAST-PERLANE-NEXT: # xmm5 = mem[0,0]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm5, %ymm3
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,3,1,7,0,3,1,7]
@@ -1529,7 +1545,8 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-SLOW-NEXT: vmovaps 272(%rdi), %xmm4
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6,7]
; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
-; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm5 = <4,2,u,u>
+; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm5 = [4,2,4,2]
+; AVX2-SLOW-NEXT: # xmm5 = mem[0,0]
; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm5, %ymm8
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1],ymm14[2,3],ymm15[4,5,6,7]
@@ -1550,7 +1567,8 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm10[5,6,7]
; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,1,1,1,5,5,5,5]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7]
-; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm4 = <5,3,u,u>
+; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm4 = [5,3,5,3]
+; AVX2-SLOW-NEXT: # xmm4 = mem[0,0]
; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm4, %ymm1
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,3,1,7,0,3,1,7]
@@ -1704,7 +1722,8 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: vmovaps 272(%rdi), %xmm4
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6,7]
; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
-; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm5 = <4,2,u,u>
+; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm5 = [4,2,4,2]
+; AVX2-FAST-NEXT: # xmm5 = mem[0,0]
; AVX2-FAST-NEXT: vpermps %ymm1, %ymm5, %ymm8
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,3,4,5,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1],ymm14[2,3],ymm15[4,5,6,7]
@@ -1725,7 +1744,8 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm10[5,6,7]
; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,1,1,1,5,5,5,5]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7]
-; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm4 = <5,3,u,u>
+; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm4 = [5,3,5,3]
+; AVX2-FAST-NEXT: # xmm4 = mem[0,0]
; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm1
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7]
; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,3,1,7,0,3,1,7]
@@ -1880,7 +1900,8 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-PERLANE-NEXT: vmovaps 272(%rdi), %xmm4
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
-; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm5 = <4,2,u,u>
+; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm5 = [4,2,4,2]
+; AVX2-FAST-PERLANE-NEXT: # xmm5 = mem[0,0]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm5, %ymm8
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1],ymm14[2,3],ymm15[4,5,6,7]
@@ -1901,7 +1922,8 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm10[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,1,1,1,5,5,5,5]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm4 = <5,3,u,u>
+; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm4 = [5,3,5,3]
+; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[0,0]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm4, %ymm1
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,3,1,7,0,3,1,7]
@@ -3455,7 +3477,8 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0,1],ymm1[2,3],ymm14[4,5,6,7]
; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6]
-; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm1 = <4,2,u,u>
+; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = [4,2,4,2]
+; AVX2-SLOW-NEXT: # xmm1 = mem[0,0]
; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm1, %ymm4
; AVX2-SLOW-NEXT: vmovaps %ymm1, %ymm9
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm2[2,3,4,5,6,7]
@@ -3516,7 +3539,8 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm14[1,1,1,1,5,5,5,5]
; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm2 = ymm2[0,1,2],mem[3],ymm2[4,5,6,7]
-; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm14 = <5,3,u,u>
+; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm14 = [5,3,5,3]
+; AVX2-SLOW-NEXT: # xmm14 = mem[0,0]
; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [0,3,1,7,0,3,1,7]
@@ -3863,7 +3887,8 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0,1],ymm1[2,3],ymm14[4,5,6,7]
; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6]
-; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm1 = <4,2,u,u>
+; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm1 = [4,2,4,2]
+; AVX2-FAST-NEXT: # xmm1 = mem[0,0]
; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm4
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm2[2,3,4,5,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm6[2,3],ymm8[4,5,6,7]
@@ -3921,7 +3946,8 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm14[1,1,1,1,5,5,5,5]
; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm2 = ymm2[0,1,2],mem[3],ymm2[4,5,6,7]
-; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm14 = <5,3,u,u>
+; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm14 = [5,3,5,3]
+; AVX2-FAST-NEXT: # xmm14 = mem[0,0]
; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7]
; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [0,3,1,7,0,3,1,7]
@@ -4270,7 +4296,8 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0,1],ymm1[2,3],ymm14[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6]
-; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm1 = <4,2,u,u>
+; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm1 = [4,2,4,2]
+; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[0,0]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm1, %ymm4
; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, %ymm9
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm2[2,3,4,5,6,7]
@@ -4331,7 +4358,8 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm14[1,1,1,1,5,5,5,5]
; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm2[0,1,2],mem[3],ymm2[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm14 = <5,3,u,u>
+; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm14 = [5,3,5,3]
+; AVX2-FAST-PERLANE-NEXT: # xmm14 = mem[0,0]
; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [0,3,1,7,0,3,1,7]
@@ -7520,7 +7548,8 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7]
; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
-; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm8 = <4,2,u,u>
+; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm8 = [4,2,4,2]
+; AVX2-SLOW-NEXT: # xmm8 = mem[0,0]
; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm8, %ymm1
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -7660,7 +7689,8 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-SLOW-NEXT: # ymm0 = mem[1,1,1,1,5,5,5,5]
; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7]
-; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm1 = <5,3,u,u>
+; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = [5,3,5,3]
+; AVX2-SLOW-NEXT: # xmm1 = mem[0,0]
; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm0[2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,3,1,7,0,3,1,7]
@@ -8382,7 +8412,8 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7]
; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
-; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm8 = <4,2,u,u>
+; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm8 = [4,2,4,2]
+; AVX2-FAST-NEXT: # xmm8 = mem[0,0]
; AVX2-FAST-NEXT: vpermps %ymm1, %ymm8, %ymm1
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3,4,5,6,7]
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -8524,7 +8555,8 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: # ymm0 = mem[1,1,1,1,5,5,5,5]
; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7]
-; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm1 = <5,3,u,u>
+; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm1 = [5,3,5,3]
+; AVX2-FAST-NEXT: # xmm1 = mem[0,0]
; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm0[2,3,4,5,6,7]
; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,3,1,7,0,3,1,7]
@@ -9251,7 +9283,8 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
-; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm8 = <4,2,u,u>
+; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm8 = [4,2,4,2]
+; AVX2-FAST-PERLANE-NEXT: # xmm8 = mem[0,0]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm8, %ymm1
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -9391,7 +9424,8 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[1,1,1,1,5,5,5,5]
; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm1 = <5,3,u,u>
+; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm1 = [5,3,5,3]
+; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[0,0]
; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm0[2,3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,3,1,7,0,3,1,7]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll
index 011485f16168e..f9713d1eab16c 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll
@@ -100,7 +100,8 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3]
; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,2,2,3]
-; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm4 = <4,3,u,u>
+; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [4,3,4,3]
+; AVX2-ONLY-NEXT: # xmm4 = mem[0,0]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm4, %ymm4
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
@@ -421,7 +422,8 @@ define void @load_i32_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3]
; AVX2-SLOW-NEXT: vbroadcastss 100(%rdi), %xmm9
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm9 = xmm8[0,1,2],xmm9[3]
-; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm10 = <4,3,u,u>
+; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm10 = [4,3,4,3]
+; AVX2-SLOW-NEXT: # xmm10 = mem[0,0]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm10, %ymm10
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
@@ -457,7 +459,8 @@ define void @load_i32_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vpermps %ymm3, %ymm2, %ymm2
; AVX2-FAST-NEXT: vbroadcastss 84(%rdi), %xmm3
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3]
-; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm3 = <1,0,7,u,u,u,u,u>
+; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [1,0,7,0,1,0,7,0]
+; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
; AVX2-FAST-NEXT: vpermps %ymm4, %ymm3, %ymm3
; AVX2-FAST-NEXT: vmovaps 80(%rdi), %xmm5
@@ -478,7 +481,8 @@ define void @load_i32_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3]
; AVX2-FAST-NEXT: vbroadcastss 100(%rdi), %xmm9
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm9 = xmm8[0,1,2],xmm9[3]
-; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm10 = <4,3,u,u>
+; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm10 = [4,3,4,3]
+; AVX2-FAST-NEXT: # xmm10 = mem[0,0]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FAST-NEXT: vpermps %ymm11, %ymm10, %ymm10
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
@@ -535,7 +539,8 @@ define void @load_i32_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3]
; AVX2-FAST-PERLANE-NEXT: vbroadcastss 100(%rdi), %xmm9
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm9 = xmm8[0,1,2],xmm9[3]
-; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm10 = <4,3,u,u>
+; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm10 = [4,3,4,3]
+; AVX2-FAST-PERLANE-NEXT: # xmm10 = mem[0,0]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm10, %ymm10
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
@@ -4298,7 +4303,8 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-SLOW-NEXT: vbroadcastss 100(%rdi), %xmm1
; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %xmm0
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3]
-; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm3 = <4,3,u,u>
+; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm3 = [4,3,4,3]
+; AVX2-SLOW-NEXT: # xmm3 = mem[0,0]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7]
@@ -5321,7 +5327,8 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-PERLANE-NEXT: vbroadcastss 100(%rdi), %xmm1
; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %xmm0
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3]
-; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm3 = <4,3,u,u>
+; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm3 = [4,3,4,3]
+; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[0,0]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7]
@@ -9202,7 +9209,8 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-SLOW-NEXT: vbroadcastss 100(%rdi), %xmm1
; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %xmm0
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3]
-; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm5 = <4,3,u,u>
+; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm5 = [4,3,4,3]
+; AVX2-SLOW-NEXT: # xmm5 = mem[0,0]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7]
@@ -10244,7 +10252,8 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: vbroadcastss 100(%rdi), %xmm1
; AVX2-FAST-NEXT: vmovaps 64(%rdi), %xmm0
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3]
-; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm5 = <4,3,u,u>
+; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm5 = [4,3,4,3]
+; AVX2-FAST-NEXT: # xmm5 = mem[0,0]
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm2 = ymm2[0,1,2,3],mem[4,5,6,7]
@@ -11293,7 +11302,8 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-PERLANE-NEXT: vbroadcastss 100(%rdi), %xmm1
; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %xmm0
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3]
-; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm5 = <4,3,u,u>
+; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm5 = [4,3,4,3]
+; AVX2-FAST-PERLANE-NEXT: # xmm5 = mem[0,0]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll
index fede3ba1ca14b..a4482bafbd535 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll
@@ -136,7 +136,7 @@ define void @store_i32_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm0
; AVX2-FAST-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0
-; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = <u,3,7,u,u,u,u,u>
+; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm1 = [7,3,7,3,7,3,7,3]
; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm1
; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0]
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3]
@@ -306,7 +306,8 @@ define void @store_i32_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2],ymm0[3,4],ymm4[5],ymm0[6,7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm2[1,1,2,2]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm0[1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7]
-; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm4 = <5,u,u,6,u,u,7,u>
+; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [5,0,7,6,5,0,7,6]
+; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm1
; AVX2-FAST-NEXT: vbroadcastsd 24(%rdi), %ymm4
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll
index 0a28126d1b3ac..17bd3eb320104 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll
@@ -244,7 +244,7 @@ define void @store_i32_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm3[0,1,2,1]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4,5],ymm5[6],ymm4[7]
-; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm5 = <u,3,7,u,u,u,u,u>
+; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm5 = [7,3,7,3,7,3,7,3]
; AVX2-FAST-NEXT: vpermps %ymm2, %ymm5, %ymm2
; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll
index 083e6e7f4b1de..5d478ae0f3e25 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll
@@ -265,7 +265,8 @@ define void @store_i32_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm6
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm7
; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm8
-; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm9 = <u,u,0,4>
+; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [0,4,0,4]
+; AVX2-ONLY-NEXT: # xmm9 = mem[0,0]
; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm9, %ymm9
; AVX2-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm10 = [0,4,1,5,0,4,1,5]
; AVX2-ONLY-NEXT: # ymm10 = mem[0,1,0,1]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll
index 6bbba6fc39143..ab3122960f53c 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll
@@ -139,7 +139,8 @@ define void @store_i32_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3]
; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm4 = <0,2,4,6,u,u,u,1>
; AVX2-FAST-NEXT: vpermps %ymm2, %ymm4, %ymm2
-; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm4 = <u,u,u,u,0,2,4,u>
+; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,2,4,0,0,2,4,0]
+; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpermps %ymm0, %ymm4, %ymm0
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6],ymm2[7]
; AVX2-FAST-NEXT: vextractf128 $1, %ymm3, %xmm2
@@ -439,7 +440,8 @@ define void @store_i32_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm10, %ymm10
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0],ymm5[1,2],ymm10[3,4,5,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm9[3,4,5],ymm5[6,7]
-; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm9 = <u,u,0,4>
+; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm9 = [0,4,0,4]
+; AVX2-SLOW-NEXT: # xmm9 = mem[0,0]
; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm9, %ymm6
; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm9 = [0,4,0,1,0,4,0,1]
; AVX2-SLOW-NEXT: # ymm9 = mem[0,1,0,1]
@@ -476,14 +478,14 @@ define void @store_i32_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm7
; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm8 = ymm6[1,1,1,1,5,5,5,5]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7]
-; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm8 = <u,1,5,u,u,u,u,u>
+; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm8 = [5,1,5,1,5,1,5,1]
; AVX2-FAST-NEXT: vpermps %ymm3, %ymm8, %ymm8
; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm9 = [5,0,2,6,5,0,2,6]
; AVX2-FAST-NEXT: # ymm9 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpermps %ymm2, %ymm9, %ymm9
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0],ymm8[1,2],ymm9[3,4,5,6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5],ymm8[6,7]
-; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm8 = <u,u,u,u,u,3,7,u>
+; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm8 = [7,3,7,3,7,3,7,3]
; AVX2-FAST-NEXT: vpermps %ymm2, %ymm8, %ymm8
; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm9 = [2,6,0,3,2,6,0,3]
; AVX2-FAST-NEXT: # ymm9 = mem[0,1,0,1]
@@ -493,7 +495,8 @@ define void @store_i32_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm4 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7]
; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,0,1,4,5,4,5]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1],ymm4[2,3,4],ymm8[5,6,7]
-; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm5 = <u,u,0,4>
+; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm5 = [0,4,0,4]
+; AVX2-FAST-NEXT: # xmm5 = mem[0,0]
; AVX2-FAST-NEXT: vpermps %ymm3, %ymm5, %ymm3
; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm5 = [0,4,0,1,0,4,0,1]
; AVX2-FAST-NEXT: # ymm5 = mem[0,1,0,1]
@@ -504,7 +507,7 @@ define void @store_i32_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FAST-NEXT: vbroadcastss (%r10), %ymm5
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6],ymm2[7]
-; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm3 = <u,3,7,u,u,u,u,u>
+; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm3 = [7,3,7,3,7,3,7,3]
; AVX2-FAST-NEXT: vpermps %ymm6, %ymm3, %ymm3
; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3]
@@ -548,7 +551,8 @@ define void @store_i32_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm10, %ymm10
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0],ymm5[1,2],ymm10[3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm9[3,4,5],ymm5[6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm9 = <u,u,0,4>
+; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm9 = [0,4,0,4]
+; AVX2-FAST-PERLANE-NEXT: # xmm9 = mem[0,0]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm9, %ymm6
; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm9 = [0,4,0,1,0,4,0,1]
; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0,1,0,1]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll
index c20180523661e..69d8fa57cd482 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll
@@ -263,7 +263,8 @@ define void @store_i32_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm6, %ymm9
; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm6, %ymm6
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm9[6,7]
-; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm9 = <u,u,0,4>
+; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [0,4,0,4]
+; AVX2-ONLY-NEXT: # xmm9 = mem[0,0]
; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm9, %ymm9
; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
@@ -272,7 +273,8 @@ define void @store_i32_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm9, %ymm10
; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm9, %ymm9
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7]
-; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm10 = <1,5,u,u>
+; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = [1,5,1,5]
+; AVX2-ONLY-NEXT: # xmm10 = mem[0,0]
; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm10, %ymm10
; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm11 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm11[2,3]
@@ -281,7 +283,8 @@ define void @store_i32_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm10, %ymm11
; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm10, %ymm10
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7]
-; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm11 = <u,u,2,6>
+; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [2,6,2,6]
+; AVX2-ONLY-NEXT: # xmm11 = mem[0,0]
; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm11, %ymm8
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm8[2,3]
@@ -290,7 +293,8 @@ define void @store_i32_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm4, %ymm7
; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm4, %ymm4
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm7[6,7]
-; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm5 = <3,7,u,u>
+; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [3,7,3,7]
+; AVX2-ONLY-NEXT: # xmm5 = mem[0,0]
; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm5, %ymm2
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
diff --git a/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll b/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll
index c57f4d9cb59b2..320b63ee20bd5 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll
@@ -255,7 +255,7 @@ define i64 @test_v16i64_v16i8(<16 x i64> %a0) {
;
; AVX1-LABEL: test_v16i64_v16i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [1,1,1,1]
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [1,1,1,1]
; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
@@ -577,7 +577,7 @@ define i32 @test_v16i32_v16i8(<16 x i32> %a0) {
;
; AVX1-SLOW-LABEL: test_v16i32_v16i8:
; AVX1-SLOW: # %bb.0:
-; AVX1-SLOW-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
+; AVX1-SLOW-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX1-SLOW-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-SLOW-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
@@ -595,7 +595,7 @@ define i32 @test_v16i32_v16i8(<16 x i32> %a0) {
;
; AVX1-FAST-LABEL: test_v16i32_v16i8:
; AVX1-FAST: # %bb.0:
-; AVX1-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
+; AVX1-FAST-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX1-FAST-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-FAST-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2
@@ -694,7 +694,7 @@ define i32 @test_v32i32_v32i8(<32 x i32> %a0) {
;
; AVX1-SLOW-LABEL: test_v32i32_v32i8:
; AVX1-SLOW: # %bb.0:
-; AVX1-SLOW-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255]
+; AVX1-SLOW-NEXT: vbroadcastss {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255]
; AVX1-SLOW-NEXT: vandps %ymm4, %ymm0, %ymm0
; AVX1-SLOW-NEXT: vandps %ymm4, %ymm2, %ymm2
; AVX1-SLOW-NEXT: vandps %ymm4, %ymm1, %ymm1
@@ -720,7 +720,7 @@ define i32 @test_v32i32_v32i8(<32 x i32> %a0) {
;
; AVX1-FAST-LABEL: test_v32i32_v32i8:
; AVX1-FAST: # %bb.0:
-; AVX1-FAST-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255]
+; AVX1-FAST-NEXT: vbroadcastss {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255]
; AVX1-FAST-NEXT: vandps %ymm4, %ymm0, %ymm0
; AVX1-FAST-NEXT: vandps %ymm4, %ymm2, %ymm2
; AVX1-FAST-NEXT: vandps %ymm4, %ymm1, %ymm1
@@ -1222,7 +1222,7 @@ define i16 @test_v64i16_v64i8(<64 x i16> %a0) {
;
; AVX1-LABEL: test_v64i16_v64i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
diff --git a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
index e4cc9731c6105..5f8c06625a93c 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
@@ -633,7 +633,7 @@ define i1 @trunc_v16i32_v16i1(<16 x i32>) {
;
; AVX1-LABEL: trunc_v16i32_v16i1:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
index a3dd5bf3a1d42..7e7ba8b9ae65b 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -2211,7 +2211,7 @@ define <32 x i8> @shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
define <32 x i8> @shuffle_v32i8_00_33_02_35_04_37_06_39_08_41_10_43_12_45_14_47_16_49_18_51_20_53_22_55_24_57_26_59_28_61_30_63(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_33_02_35_04_37_06_39_08_41_10_43_12_45_14_47_16_49_18_51_20_53_22_55_24_57_26_59_28_61_30_63:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
@@ -2247,7 +2247,7 @@ define <32 x i8> @shuffle_v32i8_00_33_02_35_04_37_06_39_08_41_10_43_12_45_14_47_
define <32 x i8> @shuffle_v32i8_32_01_34_03_36_05_38_07_40_09_42_11_44_13_46_15_48_17_50_19_52_21_54_23_56_25_58_27_60_29_62_31(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_32_01_34_03_36_05_38_07_40_09_42_11_44_13_46_15_48_17_50_19_52_21_54_23_56_25_58_27_60_29_62_31:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; AVX1-NEXT: vandnps %ymm0, %ymm2, %ymm0
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
index 8a302e026b6b4..30b743cb7bdfd 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -903,7 +903,8 @@ define <8 x float> @shuffle_v8f32_c348cda0(<8 x float> %a, <8 x float> %b) {
;
; AVX2-FAST-ALL-LABEL: shuffle_v8f32_c348cda0:
; AVX2-FAST-ALL: # %bb.0:
-; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = <u,3,4,u,u,u,u,0>
+; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,3,4,0,0,3,4,0]
+; AVX2-FAST-ALL-NEXT: # ymm2 = mem[0,1,0,1]
; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm2, %ymm0
; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [4,5,2,0,4,5,2,0]
; AVX2-FAST-ALL-NEXT: # ymm2 = mem[0,1,0,1]
@@ -953,7 +954,7 @@ define <8 x float> @shuffle_v8f32_f511235a(<8 x float> %a, <8 x float> %b) {
;
; AVX2-FAST-ALL-LABEL: shuffle_v8f32_f511235a:
; AVX2-FAST-ALL: # %bb.0:
-; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = <7,u,u,u,u,u,u,2>
+; AVX2-FAST-ALL-NEXT: vbroadcastsd {{.*#+}} ymm2 = [7,2,7,2,7,2,7,2]
; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm2, %ymm1
; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = <u,5,1,1,2,3,5,u>
; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm2, %ymm0
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll b/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll
index 66d2ccff6d77f..90b5e70a0a302 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll
@@ -300,7 +300,8 @@ define <8 x float> @expand15(<4 x float> %a) {
; AVX512-FAST-LABEL: expand15:
; AVX512-FAST: # %bb.0:
; AVX512-FAST-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX512-FAST-NEXT: vmovaps {{.*#+}} ymm1 = <u,u,0,u,1,u,u,u>
+; AVX512-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [1,0,0,0,1,0,0,0]
+; AVX512-FAST-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX512-FAST-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6,7]
; AVX512-FAST-NEXT: ret{{[l|q]}}
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
index 0ad7b898e07e8..b34af730565e4 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
@@ -501,7 +501,8 @@ define void @PR48908(<4 x double> %v0, <4 x double> %v1, <4 x double> %v2, ptr n
; X86-AVX512-NEXT: vmovapd {{.*#+}} ymm4 = [0,0,3,0,10,0,1,0]
; X86-AVX512-NEXT: vpermi2pd %zmm0, %zmm3, %zmm4
; X86-AVX512-NEXT: vmovapd %ymm4, (%ecx)
-; X86-AVX512-NEXT: vmovapd {{.*#+}} ymm3 = <3,0,11,0,u,u,u,u>
+; X86-AVX512-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [3,0,11,0,3,0,11,0]
+; X86-AVX512-NEXT: # ymm3 = mem[0,1,0,1]
; X86-AVX512-NEXT: vpermi2pd %zmm1, %zmm0, %zmm3
; X86-AVX512-NEXT: vmovapd {{.*#+}} ymm0 = [2,0,8,0,9,0,3,0]
; X86-AVX512-NEXT: vpermi2pd %zmm3, %zmm2, %zmm0
@@ -568,7 +569,8 @@ define void @PR48908(<4 x double> %v0, <4 x double> %v1, <4 x double> %v2, ptr n
; X64-AVX512-NEXT: vmovapd {{.*#+}} ymm3 = [0,3,10,1]
; X64-AVX512-NEXT: vpermi2pd %zmm0, %zmm4, %zmm3
; X64-AVX512-NEXT: vmovapd %ymm3, (%rsi)
-; X64-AVX512-NEXT: vmovapd {{.*#+}} ymm3 = <3,11,u,u>
+; X64-AVX512-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [3,11,3,11]
+; X64-AVX512-NEXT: # ymm3 = mem[0,1,0,1]
; X64-AVX512-NEXT: vpermi2pd %zmm1, %zmm0, %zmm3
; X64-AVX512-NEXT: vmovapd {{.*#+}} ymm0 = [2,8,9,3]
; X64-AVX512-NEXT: vpermi2pd %zmm3, %zmm2, %zmm0
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
index c87869e6c71f6..1b9648e77162e 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
@@ -2842,7 +2842,8 @@ define <4 x float> @PR30264(<4 x float> %x) {
;
; AVX-LABEL: PR30264:
; AVX: # %bb.0:
-; AVX-NEXT: vmovaps {{.*#+}} xmm1 = <u,u,4.0E+0,1.0E+0>
+; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [4.0E+0,1.0E+0,4.0E+0,1.0E+0]
+; AVX-NEXT: # xmm1 = mem[0,0]
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[2,3]
; AVX-NEXT: retq
%shuf1 = shufflevector <4 x float> %x, <4 x float> <float undef, float 0.0, float undef, float undef>, <4 x i32> <i32 0, i32 5, i32 undef, i32 undef>
@@ -3471,37 +3472,21 @@ define void @SpinningCube() {
; SSE41-NEXT: movaps %xmm2, (%rax)
; SSE41-NEXT: retq
;
-; AVX1-LABEL: SpinningCube:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000
-; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = <u,u,u,1.0E+0>
-; AVX1-NEXT: vmovaps {{.*#+}} xmm1 = <0.0E+0,0.0E+0,-2.0E+0,u>
-; AVX1-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm1[0,1,2],xmm2[0]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[2,3]
-; AVX1-NEXT: vaddps %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vmovaps %xmm2, (%rax)
-; AVX1-NEXT: vbroadcastss (%rax), %xmm2
-; AVX1-NEXT: vmulps %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vaddps %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vmovaps %xmm0, (%rax)
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: SpinningCube:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000
-; AVX2-NEXT: vbroadcastss {{.*#+}} xmm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; AVX2-NEXT: vmovaps {{.*#+}} xmm1 = <0.0E+0,0.0E+0,-2.0E+0,u>
-; AVX2-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; AVX2-NEXT: vinsertps {{.*#+}} xmm3 = xmm1[0,1,2],xmm2[0]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[2,3]
-; AVX2-NEXT: vaddps %xmm2, %xmm3, %xmm2
-; AVX2-NEXT: vmovaps %xmm2, (%rax)
-; AVX2-NEXT: vbroadcastss (%rax), %xmm2
-; AVX2-NEXT: vmulps %xmm1, %xmm2, %xmm1
-; AVX2-NEXT: vaddps %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vmovaps %xmm0, (%rax)
-; AVX2-NEXT: retq
+; AVX-LABEL: SpinningCube:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000
+; AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX-NEXT: vmovaps {{.*#+}} xmm1 = <0.0E+0,0.0E+0,-2.0E+0,u>
+; AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; AVX-NEXT: vinsertps {{.*#+}} xmm3 = xmm1[0,1,2],xmm2[0]
+; AVX-NEXT: vinsertps {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[2,3]
+; AVX-NEXT: vaddps %xmm2, %xmm3, %xmm2
+; AVX-NEXT: vmovaps %xmm2, (%rax)
+; AVX-NEXT: vbroadcastss (%rax), %xmm2
+; AVX-NEXT: vmulps %xmm1, %xmm2, %xmm1
+; AVX-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vmovaps %xmm0, (%rax)
+; AVX-NEXT: retq
entry:
store float 1.000000e+00, ptr undef, align 4
%0 = load float, ptr undef, align 4
diff --git a/llvm/test/CodeGen/X86/vector-trunc-math.ll b/llvm/test/CodeGen/X86/vector-trunc-math.ll
index 39dc8662c7a4e..37b996bfe686a 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-math.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-math.ll
@@ -528,7 +528,7 @@ define <8 x i16> @trunc_add_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
;
; AVX1-LABEL: trunc_add_const_v8i64_v8i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
@@ -628,7 +628,7 @@ define <16 x i8> @trunc_add_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
;
; AVX1-LABEL: trunc_add_const_v16i64_v16i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255]
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [255,255,255,255]
; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
@@ -696,7 +696,7 @@ define <16 x i8> @trunc_add_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
;
; AVX1-LABEL: trunc_add_const_v16i32_v16i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
@@ -1276,7 +1276,7 @@ define <8 x i16> @trunc_sub_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
;
; AVX1-LABEL: trunc_sub_const_v8i64_v8i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
@@ -1376,7 +1376,7 @@ define <16 x i8> @trunc_sub_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
;
; AVX1-LABEL: trunc_sub_const_v16i64_v16i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255]
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [255,255,255,255]
; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
@@ -1444,7 +1444,7 @@ define <16 x i8> @trunc_sub_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
;
; AVX1-LABEL: trunc_sub_const_v16i32_v16i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
@@ -1686,7 +1686,7 @@ define <8 x i16> @trunc_mul_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
;
; AVX1-LABEL: trunc_mul_v8i64_v8i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,65535]
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [65535,65535,65535,65535]
; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
@@ -2202,7 +2202,7 @@ define <8 x i16> @trunc_mul_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
;
; AVX1-LABEL: trunc_mul_const_v8i64_v8i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
@@ -2567,7 +2567,8 @@ define <4 x i32> @trunc_and_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; AVX2-FAST-ALL-LABEL: trunc_and_v4i64_v4i32:
; AVX2-FAST-ALL: # %bb.0:
; AVX2-FAST-ALL-NEXT: vandps %ymm1, %ymm0, %ymm0
-; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
+; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
+; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-FAST-ALL-NEXT: vzeroupper
@@ -2617,7 +2618,7 @@ define <8 x i16> @trunc_and_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; AVX1: # %bb.0:
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
@@ -2730,7 +2731,7 @@ define <16 x i8> @trunc_and_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
; AVX1-NEXT: vandps %ymm5, %ymm1, %ymm1
; AVX1-NEXT: vandps %ymm6, %ymm2, %ymm2
; AVX1-NEXT: vandps %ymm7, %ymm3, %ymm3
-; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255]
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [255,255,255,255]
; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
@@ -2806,7 +2807,7 @@ define <16 x i8> @trunc_and_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin
; AVX1: # %bb.0:
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
@@ -2928,7 +2929,8 @@ define <4 x i32> @trunc_and_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
;
; AVX2-FAST-ALL-LABEL: trunc_and_const_v4i64_v4i32:
; AVX2-FAST-ALL: # %bb.0:
-; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = <u,2,4,6,u,u,u,u>
+; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
+; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2-FAST-ALL-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-FAST-ALL-NEXT: vzeroupper
@@ -2973,7 +2975,7 @@ define <8 x i16> @trunc_and_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
;
; AVX1-LABEL: trunc_and_const_v8i64_v8i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
@@ -3073,7 +3075,7 @@ define <16 x i8> @trunc_and_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
;
; AVX1-LABEL: trunc_and_const_v16i64_v16i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255]
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [255,255,255,255]
; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
@@ -3141,7 +3143,7 @@ define <16 x i8> @trunc_and_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
;
; AVX1-LABEL: trunc_and_const_v16i32_v16i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
@@ -3264,7 +3266,8 @@ define <4 x i32> @trunc_xor_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; AVX2-FAST-ALL-LABEL: trunc_xor_v4i64_v4i32:
; AVX2-FAST-ALL: # %bb.0:
; AVX2-FAST-ALL-NEXT: vxorps %ymm1, %ymm0, %ymm0
-; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
+; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
+; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-FAST-ALL-NEXT: vzeroupper
@@ -3314,7 +3317,7 @@ define <8 x i16> @trunc_xor_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; AVX1: # %bb.0:
; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
@@ -3427,7 +3430,7 @@ define <16 x i8> @trunc_xor_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
; AVX1-NEXT: vxorps %ymm5, %ymm1, %ymm1
; AVX1-NEXT: vxorps %ymm6, %ymm2, %ymm2
; AVX1-NEXT: vxorps %ymm7, %ymm3, %ymm3
-; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255]
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [255,255,255,255]
; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
@@ -3503,7 +3506,7 @@ define <16 x i8> @trunc_xor_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin
; AVX1: # %bb.0:
; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
@@ -3625,7 +3628,8 @@ define <4 x i32> @trunc_xor_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
;
; AVX2-FAST-ALL-LABEL: trunc_xor_const_v4i64_v4i32:
; AVX2-FAST-ALL: # %bb.0:
-; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
+; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
+; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2-FAST-ALL-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-FAST-ALL-NEXT: vzeroupper
@@ -3670,7 +3674,7 @@ define <8 x i16> @trunc_xor_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
;
; AVX1-LABEL: trunc_xor_const_v8i64_v8i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
@@ -3770,7 +3774,7 @@ define <16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
;
; AVX1-LABEL: trunc_xor_const_v16i64_v16i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255]
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [255,255,255,255]
; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
@@ -3838,7 +3842,7 @@ define <16 x i8> @trunc_xor_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
;
; AVX1-LABEL: trunc_xor_const_v16i32_v16i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
@@ -3961,7 +3965,8 @@ define <4 x i32> @trunc_or_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; AVX2-FAST-ALL-LABEL: trunc_or_v4i64_v4i32:
; AVX2-FAST-ALL: # %bb.0:
; AVX2-FAST-ALL-NEXT: vorps %ymm1, %ymm0, %ymm0
-; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
+; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
+; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-FAST-ALL-NEXT: vzeroupper
@@ -4011,7 +4016,7 @@ define <8 x i16> @trunc_or_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; AVX1: # %bb.0:
; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
@@ -4124,7 +4129,7 @@ define <16 x i8> @trunc_or_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind
; AVX1-NEXT: vorps %ymm5, %ymm1, %ymm1
; AVX1-NEXT: vorps %ymm6, %ymm2, %ymm2
; AVX1-NEXT: vorps %ymm7, %ymm3, %ymm3
-; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255]
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [255,255,255,255]
; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
@@ -4200,7 +4205,7 @@ define <16 x i8> @trunc_or_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind
; AVX1: # %bb.0:
; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
@@ -4322,7 +4327,8 @@ define <4 x i32> @trunc_or_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
;
; AVX2-FAST-ALL-LABEL: trunc_or_const_v4i64_v4i32:
; AVX2-FAST-ALL: # %bb.0:
-; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
+; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
+; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2-FAST-ALL-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-FAST-ALL-NEXT: vzeroupper
@@ -4367,7 +4373,7 @@ define <8 x i16> @trunc_or_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
;
; AVX1-LABEL: trunc_or_const_v8i64_v8i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
@@ -4467,7 +4473,7 @@ define <16 x i8> @trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
;
; AVX1-LABEL: trunc_or_const_v16i64_v16i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255]
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [255,255,255,255]
; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
@@ -4535,7 +4541,7 @@ define <16 x i8> @trunc_or_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
;
; AVX1-LABEL: trunc_or_const_v16i32_v16i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
diff --git a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll
index 46ae1bdd8e654..f6e4377f64fa7 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll
@@ -514,7 +514,8 @@ define <4 x i32> @trunc_ssat_v4i64_v4i32(<4 x i64> %a0) {
; AVX2-FAST-ALL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968]
; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
; AVX2-FAST-ALL-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
-; AVX2-FAST-ALL-NEXT: vmovapd {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
+; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
+; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-FAST-ALL-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-trunc-usat.ll b/llvm/test/CodeGen/X86/vector-trunc-usat.ll
index 51b013c63b70c..f687374baea4b 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-usat.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-usat.ll
@@ -69,7 +69,8 @@ define <2 x i32> @trunc_usat_v2i64_v2i32(<2 x i64> %a0) {
;
; AVX-LABEL: trunc_usat_v2i64_v2i32:
; AVX: # %bb.0:
-; AVX-NEXT: vmovapd {{.*#+}} xmm1 = [4294967295,4294967295]
+; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [4294967295,4294967295]
+; AVX-NEXT: # xmm1 = mem[0,0]
; AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372041149743103,9223372041149743103]
; AVX-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
@@ -166,7 +167,8 @@ define void @trunc_usat_v2i64_v2i32_store(<2 x i64> %a0, ptr %p1) {
;
; AVX-LABEL: trunc_usat_v2i64_v2i32_store:
; AVX: # %bb.0:
-; AVX-NEXT: vmovapd {{.*#+}} xmm1 = [4294967295,4294967295]
+; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [4294967295,4294967295]
+; AVX-NEXT: # xmm1 = mem[0,0]
; AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372041149743103,9223372041149743103]
; AVX-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
@@ -307,7 +309,8 @@ define <4 x i32> @trunc_usat_v4i64_v4i32(<4 x i64> %a0) {
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vmovapd {{.*#+}} xmm3 = [4294967295,429496729]
; AVX1-NEXT: vblendvpd %xmm1, %xmm4, %xmm3, %xmm1
-; AVX1-NEXT: vmovapd {{.*#+}} xmm3 = [4294967295,4294967295]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [4294967295,4294967295]
+; AVX1-NEXT: # xmm3 = mem[0,0]
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX1-NEXT: vzeroupper
@@ -334,7 +337,8 @@ define <4 x i32> @trunc_usat_v4i64_v4i32(<4 x i64> %a0) {
; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm1
; AVX2-FAST-ALL-NEXT: vmovapd {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,429496729]
; AVX2-FAST-ALL-NEXT: vblendvpd %ymm1, %ymm0, %ymm2, %ymm0
-; AVX2-FAST-ALL-NEXT: vmovapd {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
+; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
+; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-FAST-ALL-NEXT: vzeroupper
@@ -582,7 +586,8 @@ define <8 x i32> @trunc_usat_v8i64_v8i32(ptr %p0) {
; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm5
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [9223372041149743103,9223372041149743103]
; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5
-; AVX1-NEXT: vmovapd {{.*#+}} xmm7 = [4294967295,4294967295]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [4294967295,4294967295]
+; AVX1-NEXT: # xmm7 = mem[0,0]
; AVX1-NEXT: vblendvpd %xmm5, %xmm2, %xmm7, %xmm2
; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm5
; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5
@@ -728,7 +733,8 @@ define <2 x i16> @trunc_usat_v2i64_v2i16(<2 x i64> %a0) {
;
; AVX1-LABEL: trunc_usat_v2i64_v2i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovapd {{.*#+}} xmm1 = [65535,65535]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [65535,65535]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343]
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
@@ -739,7 +745,8 @@ define <2 x i16> @trunc_usat_v2i64_v2i16(<2 x i64> %a0) {
;
; AVX2-SLOW-LABEL: trunc_usat_v2i64_v2i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vmovapd {{.*#+}} xmm1 = [65535,65535]
+; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = [65535,65535]
+; AVX2-SLOW-NEXT: # xmm1 = mem[0,0]
; AVX2-SLOW-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343]
; AVX2-SLOW-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
@@ -750,7 +757,8 @@ define <2 x i16> @trunc_usat_v2i64_v2i16(<2 x i64> %a0) {
;
; AVX2-FAST-LABEL: trunc_usat_v2i64_v2i16:
; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vmovapd {{.*#+}} xmm1 = [65535,65535]
+; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm1 = [65535,65535]
+; AVX2-FAST-NEXT: # xmm1 = mem[0,0]
; AVX2-FAST-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343]
; AVX2-FAST-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
@@ -848,7 +856,8 @@ define void @trunc_usat_v2i64_v2i16_store(<2 x i64> %a0, ptr %p1) {
;
; AVX1-LABEL: trunc_usat_v2i64_v2i16_store:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovapd {{.*#+}} xmm1 = [65535,65535]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [65535,65535]
+; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343]
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
@@ -860,7 +869,8 @@ define void @trunc_usat_v2i64_v2i16_store(<2 x i64> %a0, ptr %p1) {
;
; AVX2-SLOW-LABEL: trunc_usat_v2i64_v2i16_store:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vmovapd {{.*#+}} xmm1 = [65535,65535]
+; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = [65535,65535]
+; AVX2-SLOW-NEXT: # xmm1 = mem[0,0]
; AVX2-SLOW-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343]
; AVX2-SLOW-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
@@ -872,7 +882,8 @@ define void @trunc_usat_v2i64_v2i16_store(<2 x i64> %a0, ptr %p1) {
;
; AVX2-FAST-LABEL: trunc_usat_v2i64_v2i16_store:
; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vmovapd {{.*#+}} xmm1 = [65535,65535]
+; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm1 = [65535,65535]
+; AVX2-FAST-NEXT: # xmm1 = mem[0,0]
; AVX2-FAST-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343]
; AVX2-FAST-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
@@ -1021,7 +1032,8 @@ define <4 x i16> @trunc_usat_v4i64_v4i16(<4 x i64> %a0) {
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372036854841343,9223372036854841343]
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
-; AVX1-NEXT: vmovapd {{.*#+}} xmm5 = [65535,65535]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [65535,65535]
+; AVX1-NEXT: # xmm5 = mem[0,0]
; AVX1-NEXT: vblendvpd %xmm3, %xmm1, %xmm5, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2
@@ -1187,7 +1199,8 @@ define void @trunc_usat_v4i64_v4i16_store(<4 x i64> %a0, ptr%p1) {
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372036854841343,9223372036854841343]
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
-; AVX1-NEXT: vmovapd {{.*#+}} xmm5 = [65535,65535]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [65535,65535]
+; AVX1-NEXT: # xmm5 = mem[0,0]
; AVX1-NEXT: vblendvpd %xmm3, %xmm1, %xmm5, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2
@@ -1447,7 +1460,8 @@ define <8 x i16> @trunc_usat_v8i64_v8i16(ptr %p0) {
; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm5
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [9223372036854841343,9223372036854841343]
; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5
-; AVX1-NEXT: vmovapd {{.*#+}} xmm7 = [65535,65535]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [65535,65535]
+; AVX1-NEXT: # xmm7 = mem[0,0]
; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm7, %xmm1
; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm5
; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5
@@ -1993,7 +2007,8 @@ define <2 x i8> @trunc_usat_v2i64_v2i8(<2 x i64> %a0) {
;
; AVX-LABEL: trunc_usat_v2i64_v2i8:
; AVX: # %bb.0:
-; AVX-NEXT: vmovapd {{.*#+}} xmm1 = [255,255]
+; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [255,255]
+; AVX-NEXT: # xmm1 = mem[0,0]
; AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854776063,9223372036854776063]
; AVX-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
@@ -2093,7 +2108,8 @@ define void @trunc_usat_v2i64_v2i8_store(<2 x i64> %a0, ptr %p1) {
;
; AVX-LABEL: trunc_usat_v2i64_v2i8_store:
; AVX: # %bb.0:
-; AVX-NEXT: vmovapd {{.*#+}} xmm1 = [255,255]
+; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [255,255]
+; AVX-NEXT: # xmm1 = mem[0,0]
; AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854776063,9223372036854776063]
; AVX-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
@@ -2241,7 +2257,8 @@ define <4 x i8> @trunc_usat_v4i64_v4i8(<4 x i64> %a0) {
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854776063,9223372036854776063]
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vmovapd {{.*#+}} xmm4 = [255,255]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [255,255]
+; AVX1-NEXT: # xmm4 = mem[0,0]
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm4, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm1
@@ -2411,7 +2428,8 @@ define void @trunc_usat_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) {
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854776063,9223372036854776063]
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vmovapd {{.*#+}} xmm4 = [255,255]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [255,255]
+; AVX1-NEXT: # xmm4 = mem[0,0]
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm4, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm1
@@ -2663,7 +2681,8 @@ define <8 x i8> @trunc_usat_v8i64_v8i8(ptr %p0) {
; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm5
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [9223372036854776063,9223372036854776063]
; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5
-; AVX1-NEXT: vmovapd {{.*#+}} xmm7 = [255,255]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [255,255]
+; AVX1-NEXT: # xmm7 = mem[0,0]
; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm7, %xmm1
; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm5
; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5
@@ -2904,7 +2923,8 @@ define void @trunc_usat_v8i64_v8i8_store(ptr %p0, ptr%p1) {
; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm5
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [9223372036854776063,9223372036854776063]
; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5
-; AVX1-NEXT: vmovapd {{.*#+}} xmm7 = [255,255]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [255,255]
+; AVX1-NEXT: # xmm7 = mem[0,0]
; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm7, %xmm1
; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm5
; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5
@@ -3295,7 +3315,8 @@ define <16 x i8> @trunc_usat_v16i64_v16i8(ptr %p0) {
; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm5
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [9223372036854776063,9223372036854776063]
; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5
-; AVX1-NEXT: vmovapd {{.*#+}} xmm7 = [255,255]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [255,255]
+; AVX1-NEXT: # xmm7 = mem[0,0]
; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm7, %xmm1
; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm5
; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5
diff --git a/llvm/test/CodeGen/X86/vector-trunc.ll b/llvm/test/CodeGen/X86/vector-trunc.ll
index 87cc6a86d7dd0..b5fa7312f7121 100644
--- a/llvm/test/CodeGen/X86/vector-trunc.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc.ll
@@ -201,7 +201,7 @@ define <8 x i16> @trunc8i64_8i16(<8 x i64> %a) {
;
; AVX1-LABEL: trunc8i64_8i16:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
@@ -279,7 +279,7 @@ define void @trunc8i64_8i8(<8 x i64> %a) {
;
; AVX1-LABEL: trunc8i64_8i8:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255]
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [255,255,255,255]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
@@ -671,7 +671,7 @@ define void @trunc16i32_16i16(<16 x i32> %a) {
;
; AVX1-LABEL: trunc16i32_16i16:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535]
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
@@ -897,7 +897,7 @@ define void @trunc16i32_16i8(<16 x i32> %a) {
;
; AVX1-LABEL: trunc16i32_16i8:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
@@ -1295,7 +1295,7 @@ define void @trunc32i16_32i8(<32 x i16> %a) {
;
; AVX1-LABEL: trunc32i16_32i8:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
@@ -1733,7 +1733,7 @@ define <32 x i8> @trunc2x16i16_32i8(<16 x i16> %a, <16 x i16> %b) {
;
; AVX1-LABEL: trunc2x16i16_32i8:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
@@ -2146,7 +2146,7 @@ define void @store_merge_split(<8 x i32> %w1, <8 x i32> %w2, i64 %idx, ptr %p) a
;
; AVX1-LABEL: store_merge_split:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535]
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vselect-avx.ll b/llvm/test/CodeGen/X86/vselect-avx.ll
index cf25020877255..367e0993e76ba 100644
--- a/llvm/test/CodeGen/X86/vselect-avx.ll
+++ b/llvm/test/CodeGen/X86/vselect-avx.ll
@@ -47,7 +47,7 @@ define void @test2(ptr %call1559, i64 %indvars.iv4198, <4 x i1> %tmp1895) {
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: movq (%rdi,%rsi,8), %rax
-; AVX1-NEXT: vmovapd {{.*#+}} ymm1 = [5.0E-1,5.0E-1,5.0E-1,5.0E-1]
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm1 = [5.0E-1,5.0E-1,5.0E-1,5.0E-1]
; AVX1-NEXT: vblendvpd %ymm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
; AVX1-NEXT: vmovupd %ymm0, (%rax)
; AVX1-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/vselect-zero.ll b/llvm/test/CodeGen/X86/vselect-zero.ll
index cc37f2c0c5484..1b576b28ce831 100644
--- a/llvm/test/CodeGen/X86/vselect-zero.ll
+++ b/llvm/test/CodeGen/X86/vselect-zero.ll
@@ -125,7 +125,8 @@ define double @fsel_nonzero_false_val(double %x, double %y, double %z) {
; AVX-LABEL: fsel_nonzero_false_val:
; AVX: # %bb.0:
; AVX-NEXT: vcmpeqsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovapd {{.*#+}} xmm1 = [4.2E+1,4.2E+1]
+; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [4.2E+1,4.2E+1]
+; AVX-NEXT: # xmm1 = mem[0,0]
; AVX-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
; AVX-NEXT: retq
;
@@ -179,7 +180,8 @@ define double @fsel_nonzero_constants(double %x, double %y) {
; AVX-LABEL: fsel_nonzero_constants:
; AVX: # %bb.0:
; AVX-NEXT: vcmpeqsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovapd {{.*#+}} xmm1 = [4.2E+1,4.2E+1]
+; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [4.2E+1,4.2E+1]
+; AVX-NEXT: # xmm1 = mem[0,0]
; AVX-NEXT: vblendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
; AVX-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/win_cst_pool.ll b/llvm/test/CodeGen/X86/win_cst_pool.ll
index 5bc9d34e4b35d..1fc05b26fddb5 100644
--- a/llvm/test/CodeGen/X86/win_cst_pool.ll
+++ b/llvm/test/CodeGen/X86/win_cst_pool.ll
@@ -65,16 +65,14 @@ define <8 x i16> @vec2() {
define <4 x float> @undef1() {
ret <4 x float> <float 1.0, float 1.0, float undef, float undef>
-; CHECK: .globl __xmm@00000000000000003f8000003f800000
-; CHECK-NEXT: .section .rdata,"dr",discard,__xmm@00000000000000003f8000003f800000
-; CHECK-NEXT: .p2align 4
-; CHECK-NEXT: __xmm@00000000000000003f8000003f800000:
-; CHECK-NEXT: .long 0x3f800000 # float 1
+; CHECK: .globl __real@3f800000
+; CHECK-NEXT: .section .rdata,"dr",discard,__real@3f800000
+; CHECK-NEXT: .p2align 2
+; CHECK-NEXT: __real@3f800000:
; CHECK-NEXT: .long 0x3f800000 # float 1
-; CHECK-NEXT: .zero 4
-; CHECK-NEXT: .zero 4
+; CHECK-NEXT: .text
; CHECK: undef1:
-; CHECK: movaps __xmm@00000000000000003f8000003f800000(%rip), %xmm0
+; CHECK: vbroadcastss __real@3f800000(%rip), %xmm0
; CHECK-NEXT: ret
}