[llvm] d1592a9 - [X86] X86FixupVectorConstantsPass - use scheduler model to avoid regressions (#140028)
via llvm-commits
llvm-commits at lists.llvm.org
Wed May 28 03:02:41 PDT 2025
Author: Simon Pilgrim
Date: 2025-05-28T11:02:37+01:00
New Revision: d1592a966bc22b94362380aa690eeb92f42b8ca0
URL: https://github.com/llvm/llvm-project/commit/d1592a966bc22b94362380aa690eeb92f42b8ca0
DIFF: https://github.com/llvm/llvm-project/commit/d1592a966bc22b94362380aa690eeb92f42b8ca0.diff
LOG: [X86] X86FixupVectorConstantsPass - use scheduler model to avoid regressions (#140028)
When attempting to replace a full vector constant load with an instruction that uses a smaller constant, check the scheduler model to ensure the instruction isn't slower.
Throughput must not regress, but a small increase in latency is allowed, scaled by how much constant data we're saving (I've used a simple estimate of 1 cycle per 128 bits of data saved).
NOTE: this currently ignores hoisted constant loads where the slower instruction might be acceptable.
Fixes #135998
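As a rough illustration of the cost:benefit check, here is a standalone sketch of the arithmetic only -- the helper function and the example numbers are hypothetical, not the LLVM scheduler model API:

// Sketch of the acceptance test: a smaller-constant opcode is taken only if
// throughput does not regress and any latency increase stays within one
// cycle per 128 bits of constant data saved.
#include <cassert>

bool newOpcPreferable(double OldTput, double NewTput, int OldLat, int NewLat,
                      unsigned RegBitWidth, unsigned NumCstElts,
                      unsigned MemBitWidth) {
  if (OldTput != NewTput)
    return NewTput < OldTput; // if they differ, new must be strictly faster
  unsigned BitsSaved = RegBitWidth - NumCstElts * MemBitWidth;
  int LatTol = (BitsSaved + 127) / 128; // 1 extra cycle per 128 bits saved
  if (OldLat != NewLat)
    return NewLat < OldLat + LatTol;
  return true; // equal tput/lat: prefer the smaller constant pool entry
}

int main() {
  // e.g. broadcasting one 32-bit element instead of loading a full 512-bit
  // constant saves 480 bits, so up to (480 + 127) / 128 = 4 extra cycles of
  // latency are tolerated.
  assert(newOpcPreferable(0.5, 0.5, /*OldLat=*/7, /*NewLat=*/10,
                          /*RegBitWidth=*/512, /*NumCstElts=*/1,
                          /*MemBitWidth=*/32));
}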
Added:
Modified:
llvm/lib/Target/X86/X86FixupVectorConstants.cpp
llvm/test/CodeGen/X86/avgceils.ll
llvm/test/CodeGen/X86/avgfloors.ll
llvm/test/CodeGen/X86/avx512-build-vector.ll
llvm/test/CodeGen/X86/combine-or-shuffle.ll
llvm/test/CodeGen/X86/combine-or.ll
llvm/test/CodeGen/X86/constant-pool-sharing.ll
llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
llvm/test/CodeGen/X86/fpclamptosat_vec.ll
llvm/test/CodeGen/X86/icmp-pow2-mask.ll
llvm/test/CodeGen/X86/kmov.ll
llvm/test/CodeGen/X86/machine-combiner-int-vec.ll
llvm/test/CodeGen/X86/min-legal-vector-width.ll
llvm/test/CodeGen/X86/pr30290.ll
llvm/test/CodeGen/X86/pr46532.ll
llvm/test/CodeGen/X86/pr57340.ll
llvm/test/CodeGen/X86/pr63108.ll
llvm/test/CodeGen/X86/pr77459.ll
llvm/test/CodeGen/X86/recip-fastmath.ll
llvm/test/CodeGen/X86/recip-fastmath2.ll
llvm/test/CodeGen/X86/slow-pmulld.ll
llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll
llvm/test/CodeGen/X86/sqrt-fastmath.ll
llvm/test/CodeGen/X86/sse-domains.ll
llvm/test/CodeGen/X86/trunc-vector-width.ll
llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll
llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll
llvm/test/CodeGen/X86/vector-shuffle-combining.ll
llvm/test/CodeGen/X86/vector-trunc-usat.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86FixupVectorConstants.cpp b/llvm/lib/Target/X86/X86FixupVectorConstants.cpp
index 11f6a6d4de2d1..6fe39c6789737 100644
--- a/llvm/lib/Target/X86/X86FixupVectorConstants.cpp
+++ b/llvm/lib/Target/X86/X86FixupVectorConstants.cpp
@@ -347,6 +347,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
bool HasBWI = ST->hasBWI();
bool HasVLX = ST->hasVLX();
bool MultiDomain = ST->hasAVX512() || ST->hasNoDomainDelayMov();
+ bool OptSize = MF.getFunction().hasOptSize();
struct FixupEntry {
int Op;
@@ -355,6 +356,36 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
std::function<Constant *(const Constant *, unsigned, unsigned, unsigned)>
RebuildConstant;
};
+
+ auto NewOpcPreferable = [&](const FixupEntry &Fixup,
+ unsigned RegBitWidth) -> bool {
+ if (SM->hasInstrSchedModel()) {
+ unsigned NewOpc = Fixup.Op;
+ auto *OldDesc = SM->getSchedClassDesc(TII->get(Opc).getSchedClass());
+ auto *NewDesc = SM->getSchedClassDesc(TII->get(NewOpc).getSchedClass());
+ unsigned BitsSaved = RegBitWidth - (Fixup.NumCstElts * Fixup.MemBitWidth);
+
+ // Compare tput/lat - avoid any regressions, but allow an extra cycle of
+ // latency in exchange for each 128-bit (or smaller) constant pool reduction
+ // (this is a very simple cost:benefit estimate - there will probably be
+ // better ways to calculate this).
+ double OldTput = MCSchedModel::getReciprocalThroughput(*ST, *OldDesc);
+ double NewTput = MCSchedModel::getReciprocalThroughput(*ST, *NewDesc);
+ if (OldTput != NewTput)
+ return NewTput < OldTput;
+
+ int LatTol = (BitsSaved + 127) / 128;
+ int OldLat = MCSchedModel::computeInstrLatency(*ST, *OldDesc);
+ int NewLat = MCSchedModel::computeInstrLatency(*ST, *NewDesc);
+ if (OldLat != NewLat)
+ return NewLat < (OldLat + LatTol);
+ }
+
+ // Either we were unable to get tput/lat, or all values were equal.
+ // Prefer the new opcode for reduced constant pool size.
+ return true;
+ };
+
auto FixupConstant = [&](ArrayRef<FixupEntry> Fixups, unsigned RegBitWidth,
unsigned OperandNo) {
#ifdef EXPENSIVE_CHECKS
@@ -371,7 +402,11 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
unsigned CstBitWidth = C->getType()->getPrimitiveSizeInBits();
RegBitWidth = RegBitWidth ? RegBitWidth : CstBitWidth;
for (const FixupEntry &Fixup : Fixups) {
- if (Fixup.Op) {
+ // Always use the smallest possible constant load with opt/minsize,
+ // otherwise use the smallest instruction that doesn't affect
+ // performance.
+ // TODO: If constant has been hoisted from loop, use smallest constant.
+ if (Fixup.Op && (OptSize || NewOpcPreferable(Fixup, RegBitWidth))) {
// Construct a suitable constant and adjust the MI to use the new
// constant pool entry.
if (Constant *NewCst = Fixup.RebuildConstant(
diff --git a/llvm/test/CodeGen/X86/avgceils.ll b/llvm/test/CodeGen/X86/avgceils.ll
index bb6f3fc9f588a..70b2d4be2fd81 100644
--- a/llvm/test/CodeGen/X86/avgceils.ll
+++ b/llvm/test/CodeGen/X86/avgceils.ll
@@ -39,7 +39,7 @@ define <16 x i8> @test_fixed_v16i8(<16 x i8> %a0, <16 x i8> %a1) nounwind {
;
; AVX512-LABEL: test_fixed_v16i8:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX512-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpavgb %xmm1, %xmm0, %xmm0
@@ -82,7 +82,7 @@ define <16 x i8> @test_ext_v16i8(<16 x i8> %a0, <16 x i8> %a1) nounwind {
;
; AVX512-LABEL: test_ext_v16i8:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX512-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpavgb %xmm1, %xmm0, %xmm0
@@ -365,7 +365,7 @@ define <32 x i8> @test_fixed_v32i8(<32 x i8> %a0, <32 x i8> %a1) nounwind {
;
; AVX512-LABEL: test_fixed_v32i8:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastb {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX512-NEXT: vpxor %ymm2, %ymm1, %ymm1
; AVX512-NEXT: vpxor %ymm2, %ymm0, %ymm0
; AVX512-NEXT: vpavgb %ymm1, %ymm0, %ymm0
@@ -416,7 +416,7 @@ define <32 x i8> @test_ext_v32i8(<32 x i8> %a0, <32 x i8> %a1) nounwind {
;
; AVX512-LABEL: test_ext_v32i8:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastb {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX512-NEXT: vpxor %ymm2, %ymm1, %ymm1
; AVX512-NEXT: vpxor %ymm2, %ymm0, %ymm0
; AVX512-NEXT: vpavgb %ymm1, %ymm0, %ymm0
@@ -875,7 +875,7 @@ define <64 x i8> @test_fixed_v64i8(<64 x i8> %a0, <64 x i8> %a1) nounwind {
;
; AVX512-LABEL: test_fixed_v64i8:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastb {{.*#+}} zmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512-NEXT: vpbroadcastd {{.*#+}} zmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX512-NEXT: vpxorq %zmm2, %zmm1, %zmm1
; AVX512-NEXT: vpxorq %zmm2, %zmm0, %zmm0
; AVX512-NEXT: vpavgb %zmm1, %zmm0, %zmm0
@@ -946,7 +946,7 @@ define <64 x i8> @test_ext_v64i8(<64 x i8> %a0, <64 x i8> %a1) nounwind {
;
; AVX512-LABEL: test_ext_v64i8:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastb {{.*#+}} zmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512-NEXT: vpbroadcastd {{.*#+}} zmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX512-NEXT: vpxorq %zmm2, %zmm1, %zmm1
; AVX512-NEXT: vpxorq %zmm2, %zmm0, %zmm0
; AVX512-NEXT: vpavgb %zmm1, %zmm0, %zmm0
diff --git a/llvm/test/CodeGen/X86/avgfloors.ll b/llvm/test/CodeGen/X86/avgfloors.ll
index 2566860357130..0508e5ccb5430 100644
--- a/llvm/test/CodeGen/X86/avgfloors.ll
+++ b/llvm/test/CodeGen/X86/avgfloors.ll
@@ -52,7 +52,7 @@ define <16 x i8> @test_fixed_v16i8(<16 x i8> %a0, <16 x i8> %a1) nounwind {
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpsrlw $1, %xmm0, %xmm0
-; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
; AVX512-NEXT: vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & mem)
; AVX512-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0
@@ -107,7 +107,7 @@ define <16 x i8> @test_ext_v16i8(<16 x i8> %a0, <16 x i8> %a1) nounwind {
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpsrlw $1, %xmm0, %xmm0
-; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
; AVX512-NEXT: vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & mem)
; AVX512-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0
@@ -404,7 +404,7 @@ define <32 x i8> @test_fixed_v32i8(<32 x i8> %a0, <32 x i8> %a1) nounwind {
; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpsrlw $1, %ymm0, %ymm0
-; AVX512-NEXT: vpbroadcastb {{.*#+}} ymm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
; AVX512-NEXT: vpternlogd {{.*#+}} ymm0 = ymm1 ^ (ymm0 & mem)
; AVX512-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX512-NEXT: vpsubb %ymm1, %ymm0, %ymm0
@@ -477,7 +477,7 @@ define <32 x i8> @test_ext_v32i8(<32 x i8> %a0, <32 x i8> %a1) nounwind {
; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpsrlw $1, %ymm0, %ymm0
-; AVX512-NEXT: vpbroadcastb {{.*#+}} ymm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
; AVX512-NEXT: vpternlogd {{.*#+}} ymm0 = ymm1 ^ (ymm0 & mem)
; AVX512-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX512-NEXT: vpsubb %ymm1, %ymm0, %ymm0
@@ -965,7 +965,7 @@ define <64 x i8> @test_fixed_v64i8(<64 x i8> %a0, <64 x i8> %a1) nounwind {
; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm2
; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpsrlw $1, %zmm0, %zmm0
-; AVX512-NEXT: vpbroadcastb {{.*#+}} zmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; AVX512-NEXT: vpbroadcastd {{.*#+}} zmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
; AVX512-NEXT: vpternlogd {{.*#+}} zmm0 = zmm1 ^ (zmm0 & mem)
; AVX512-NEXT: vpaddb %zmm2, %zmm0, %zmm0
; AVX512-NEXT: vpsubb %zmm1, %zmm0, %zmm0
@@ -1077,7 +1077,7 @@ define <64 x i8> @test_ext_v64i8(<64 x i8> %a0, <64 x i8> %a1) nounwind {
; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm2
; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpsrlw $1, %zmm0, %zmm0
-; AVX512-NEXT: vpbroadcastb {{.*#+}} zmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; AVX512-NEXT: vpbroadcastd {{.*#+}} zmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
; AVX512-NEXT: vpternlogd {{.*#+}} zmm0 = zmm1 ^ (zmm0 & mem)
; AVX512-NEXT: vpaddb %zmm2, %zmm0, %zmm0
; AVX512-NEXT: vpsubb %zmm1, %zmm0, %zmm0
diff --git a/llvm/test/CodeGen/X86/avx512-build-vector.ll b/llvm/test/CodeGen/X86/avx512-build-vector.ll
index b21a0c4e36c2b..55478a2e93154 100644
--- a/llvm/test/CodeGen/X86/avx512-build-vector.ll
+++ b/llvm/test/CodeGen/X86/avx512-build-vector.ll
@@ -15,7 +15,7 @@ define <16 x float> @test3(<4 x float> %a) {
; CHECK-LABEL: test3:
; CHECK: ## %bb.0:
; CHECK-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
-; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,2,3,4,18,16,7,8,9,10,11,12,13,14,15]
+; CHECK-NEXT: vmovaps {{.*#+}} zmm2 = [0,1,2,3,4,18,16,7,8,9,10,11,12,13,14,15]
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vpermt2ps %zmm0, %zmm2, %zmm1
; CHECK-NEXT: vmovaps %zmm1, %zmm0
diff --git a/llvm/test/CodeGen/X86/combine-or-shuffle.ll b/llvm/test/CodeGen/X86/combine-or-shuffle.ll
index 55b1cdeddb853..95b5fcf8eac52 100644
--- a/llvm/test/CodeGen/X86/combine-or-shuffle.ll
+++ b/llvm/test/CodeGen/X86/combine-or-shuffle.ll
@@ -836,7 +836,7 @@ define <4 x i32> @or_and_v4i32(<4 x i32> %a0) {
;
; AVX512-LABEL: or_and_v4i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm1 = [3,3,15,7]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3,15,7]
; AVX512-NEXT: vpternlogd {{.*#+}} xmm0 = xmm1 & (xmm0 | mem)
; AVX512-NEXT: retq
%1 = and <4 x i32> %a0, <i32 1, i32 3, i32 5, i32 7>
diff --git a/llvm/test/CodeGen/X86/combine-or.ll b/llvm/test/CodeGen/X86/combine-or.ll
index 08262e4d34b26..8c91274abf3dd 100644
--- a/llvm/test/CodeGen/X86/combine-or.ll
+++ b/llvm/test/CodeGen/X86/combine-or.ll
@@ -29,16 +29,11 @@ define <2 x i64> @or_zext_v2i32(<2 x i32> %a0) {
; SSE-NEXT: movaps {{.*#+}} xmm0 = [4294967295,4294967295]
; SSE-NEXT: retq
;
-; AVX1-LABEL: or_zext_v2i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4294967295,0,4294967295,0]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: or_zext_v2i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = [4294967295,4294967295]
-; AVX2-NEXT: # xmm0 = mem[0,0]
-; AVX2-NEXT: retq
+; AVX-LABEL: or_zext_v2i32:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovddup {{.*#+}} xmm0 = [4294967295,4294967295]
+; AVX-NEXT: # xmm0 = mem[0,0]
+; AVX-NEXT: retq
%1 = zext <2 x i32> %a0 to <2 x i64>
%2 = or <2 x i64> %1, <i64 4294967295, i64 4294967295>
ret <2 x i64> %2
@@ -261,7 +256,7 @@ define i64 @PR89533(<64 x i8> %a0) {
;
; AVX2-LABEL: PR89533:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95]
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95]
; AVX2-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpmovmskb %ymm0, %eax
; AVX2-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm0
diff --git a/llvm/test/CodeGen/X86/constant-pool-sharing.ll b/llvm/test/CodeGen/X86/constant-pool-sharing.ll
index 9e0114c017419..be835f78292e5 100644
--- a/llvm/test/CodeGen/X86/constant-pool-sharing.ll
+++ b/llvm/test/CodeGen/X86/constant-pool-sharing.ll
@@ -105,8 +105,7 @@ define void @store_repeated_constants(ptr %lo, ptr %hi) {
;
; AVX-LINUX-LABEL: store_repeated_constants:
; AVX-LINUX: # %bb.0:
-; AVX-LINUX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [18446744073709551615,0,18446744073709551615,0]
-; AVX-LINUX-NEXT: # ymm0 = mem[0,1,0,1]
+; AVX-LINUX-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551615,0,18446744073709551615,0]
; AVX-LINUX-NEXT: vmovaps %ymm0, (%rdi)
; AVX-LINUX-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551615,0,0,18446744073709551615]
; AVX-LINUX-NEXT: vmovaps %xmm0, %xmm1
@@ -119,8 +118,7 @@ define void @store_repeated_constants(ptr %lo, ptr %hi) {
;
; AVX-MSVC-LABEL: store_repeated_constants:
; AVX-MSVC: # %bb.0:
-; AVX-MSVC-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [18446744073709551615,0,18446744073709551615,0]
-; AVX-MSVC-NEXT: # ymm0 = mem[0,1,0,1]
+; AVX-MSVC-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551615,0,18446744073709551615,0]
; AVX-MSVC-NEXT: vmovaps %ymm0, (%rcx)
; AVX-MSVC-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551615,0,0,18446744073709551615]
; AVX-MSVC-NEXT: vmovaps %xmm0, %xmm1
diff --git a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
index 59a61722927de..5519d9b787b7f 100644
--- a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
+++ b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
@@ -389,7 +389,7 @@ define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) {
;
; CHECK-FMA-LABEL: fmul_pow2_8xhalf:
; CHECK-FMA: # %bb.0:
-; CHECK-FMA-NEXT: vpbroadcastw {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1]
+; CHECK-FMA-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1]
; CHECK-FMA-NEXT: vpsllvw %xmm0, %xmm1, %xmm0
; CHECK-FMA-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; CHECK-FMA-NEXT: vcvtdq2ps %ymm0, %ymm0
@@ -649,12 +649,26 @@ define <8 x half> @fdiv_pow2_8xhalf(<8 x i16> %i) {
; CHECK-SSE-NEXT: movdqa %xmm1, %xmm0
; CHECK-SSE-NEXT: retq
;
-; CHECK-AVX-LABEL: fdiv_pow2_8xhalf:
-; CHECK-AVX: # %bb.0:
-; CHECK-AVX-NEXT: vpsllw $10, %xmm0, %xmm0
-; CHECK-AVX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [28672,28672,28672,28672,28672,28672,28672,28672]
-; CHECK-AVX-NEXT: vpsubw %xmm0, %xmm1, %xmm0
-; CHECK-AVX-NEXT: retq
+; CHECK-AVX2-LABEL: fdiv_pow2_8xhalf:
+; CHECK-AVX2: # %bb.0:
+; CHECK-AVX2-NEXT: vpsllw $10, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [28672,28672,28672,28672,28672,28672,28672,28672]
+; CHECK-AVX2-NEXT: vpsubw %xmm0, %xmm1, %xmm0
+; CHECK-AVX2-NEXT: retq
+;
+; CHECK-NO-FASTFMA-LABEL: fdiv_pow2_8xhalf:
+; CHECK-NO-FASTFMA: # %bb.0:
+; CHECK-NO-FASTFMA-NEXT: vpsllw $10, %xmm0, %xmm0
+; CHECK-NO-FASTFMA-NEXT: vpbroadcastw {{.*#+}} xmm1 = [28672,28672,28672,28672,28672,28672,28672,28672]
+; CHECK-NO-FASTFMA-NEXT: vpsubw %xmm0, %xmm1, %xmm0
+; CHECK-NO-FASTFMA-NEXT: retq
+;
+; CHECK-FMA-LABEL: fdiv_pow2_8xhalf:
+; CHECK-FMA: # %bb.0:
+; CHECK-FMA-NEXT: vpsllw $10, %xmm0, %xmm0
+; CHECK-FMA-NEXT: vpbroadcastd {{.*#+}} xmm1 = [28672,28672,28672,28672,28672,28672,28672,28672]
+; CHECK-FMA-NEXT: vpsubw %xmm0, %xmm1, %xmm0
+; CHECK-FMA-NEXT: retq
%p2 = shl <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>, %i
%p2_f = uitofp <8 x i16> %p2 to <8 x half>
%r = fdiv <8 x half> <half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000>, %p2_f
@@ -1135,7 +1149,7 @@ define <2 x half> @fmul_pow_shl_cnt_vec_fail_to_large(<2 x i16> %cnt) nounwind {
;
; CHECK-FMA-LABEL: fmul_pow_shl_cnt_vec_fail_to_large:
; CHECK-FMA: # %bb.0:
-; CHECK-FMA-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2]
+; CHECK-FMA-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2]
; CHECK-FMA-NEXT: vpsllvw %xmm0, %xmm1, %xmm0
; CHECK-FMA-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; CHECK-FMA-NEXT: vcvtdq2ps %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/fpclamptosat_vec.ll b/llvm/test/CodeGen/X86/fpclamptosat_vec.ll
index 4ac829c0dfd53..1a2cfd69650b8 100644
--- a/llvm/test/CodeGen/X86/fpclamptosat_vec.ll
+++ b/llvm/test/CodeGen/X86/fpclamptosat_vec.ll
@@ -198,7 +198,7 @@ define <2 x i32> @ustest_f64i32(<2 x double> %x) nounwind {
; AVX2-NEXT: vcvttsd2si %xmm0, %rax
; AVX2-NEXT: vmovq %rax, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4294967295,0,4294967295,0]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4294967295,4294967295]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
@@ -576,7 +576,8 @@ define <4 x i32> @ustest_f32i32(<4 x float> %x) nounwind {
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,0,0,0,0]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
+; AVX2-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
@@ -1023,7 +1024,8 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) nounwind {
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,0,0,0,0]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
+; AVX2-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
@@ -2817,7 +2819,7 @@ define <2 x i32> @ustest_f64i32_mm(<2 x double> %x) nounwind {
; AVX2-NEXT: vcvttsd2si %xmm0, %rax
; AVX2-NEXT: vmovq %rax, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4294967295,0,4294967295,0]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4294967295,4294967295]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
@@ -3190,7 +3192,8 @@ define <4 x i32> @ustest_f32i32_mm(<4 x float> %x) nounwind {
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,0,0,0,0]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
+; AVX2-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
@@ -3632,7 +3635,8 @@ define <4 x i32> @ustest_f16i32_mm(<4 x half> %x) nounwind {
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,0,0,0,0]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
+; AVX2-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/icmp-pow2-mask.ll b/llvm/test/CodeGen/X86/icmp-pow2-mask.ll
index af3b07bc131a9..9dd52fe6b7f44 100644
--- a/llvm/test/CodeGen/X86/icmp-pow2-mask.ll
+++ b/llvm/test/CodeGen/X86/icmp-pow2-mask.ll
@@ -52,33 +52,19 @@ define <8 x i16> @pow2_mask_v16i8(i8 zeroext %0) {
}
define <16 x i16> @pow2_mask_v16i16(i16 zeroext %0) {
-; SSE2-LABEL: pow2_mask_v16i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movd %edi, %xmm0
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [128,64,32,16,8,4,2,1]
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [32768,16384,8192,4096,2048,1024,512,256]
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pcmpeqw %xmm3, %xmm0
-; SSE2-NEXT: pcmpeqw %xmm2, %xmm1
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: pow2_mask_v16i16:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movd %edi, %xmm0
-; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = [128,64,32,16,8,4,2,1]
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: pand %xmm2, %xmm1
-; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [32768,16384,8192,4096,2048,1024,512,256]
-; SSE41-NEXT: pand %xmm3, %xmm0
-; SSE41-NEXT: pcmpeqw %xmm3, %xmm0
-; SSE41-NEXT: pcmpeqw %xmm2, %xmm1
-; SSE41-NEXT: retq
+; SSE-LABEL: pow2_mask_v16i16:
+; SSE: # %bb.0:
+; SSE-NEXT: movd %edi, %xmm0
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [128,64,32,16,8,4,2,1]
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: pand %xmm2, %xmm1
+; SSE-NEXT: movdqa {{.*#+}} xmm3 = [32768,16384,8192,4096,2048,1024,512,256]
+; SSE-NEXT: pand %xmm3, %xmm0
+; SSE-NEXT: pcmpeqw %xmm3, %xmm0
+; SSE-NEXT: pcmpeqw %xmm2, %xmm1
+; SSE-NEXT: retq
;
; AVX2-LABEL: pow2_mask_v16i16:
; AVX2: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/kmov.ll b/llvm/test/CodeGen/X86/kmov.ll
index 5e31baa1ec72f..cab810d30cd77 100644
--- a/llvm/test/CodeGen/X86/kmov.ll
+++ b/llvm/test/CodeGen/X86/kmov.ll
@@ -194,7 +194,7 @@ define <8 x i1> @i16_mask_extract_8(i16 %mask) {
; X64-AVX512-LABEL: i16_mask_extract_8:
; X64-AVX512: # %bb.0:
; X64-AVX512-NEXT: vpbroadcastw %edi, %xmm0
-; X64-AVX512-NEXT: vpmovzxbw {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
+; X64-AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
; X64-AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT: retq
@@ -203,7 +203,7 @@ define <8 x i1> @i16_mask_extract_8(i16 %mask) {
; X64-KNL: # %bb.0:
; X64-KNL-NEXT: vmovd %edi, %xmm0
; X64-KNL-NEXT: vpbroadcastw %xmm0, %xmm0
-; X64-KNL-NEXT: vpmovzxbw {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
+; X64-KNL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
; X64-KNL-NEXT: vpand %xmm1, %xmm0, %xmm0
; X64-KNL-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
; X64-KNL-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/machine-combiner-int-vec.ll b/llvm/test/CodeGen/X86/machine-combiner-int-vec.ll
index ae422381c841c..ee42cb8126f74 100644
--- a/llvm/test/CodeGen/X86/machine-combiner-int-vec.ll
+++ b/llvm/test/CodeGen/X86/machine-combiner-int-vec.ll
@@ -425,7 +425,7 @@ define <2 x i64> @reassociate_umax_v2i64(<2 x i64> %x0, <2 x i64> %x1, <2 x i64>
; AVX2-LABEL: reassociate_umax_v2i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
; AVX2-NEXT: vpxor %xmm1, %xmm2, %xmm4
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm5
; AVX2-NEXT: vpcmpgtq %xmm5, %xmm4, %xmm4
@@ -723,7 +723,7 @@ define <2 x i64> @reassociate_umin_v2i64(<2 x i64> %x0, <2 x i64> %x1, <2 x i64>
; AVX2-LABEL: reassociate_umin_v2i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
; AVX2-NEXT: vpxor %xmm1, %xmm2, %xmm4
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm5
; AVX2-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4
diff --git a/llvm/test/CodeGen/X86/min-legal-vector-width.ll b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
index 9b08d8baacee1..24c884211cf97 100644
--- a/llvm/test/CodeGen/X86/min-legal-vector-width.ll
+++ b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
@@ -889,7 +889,7 @@ define dso_local void @mul256(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"="
; CHECK-SKX-VBMI-NEXT: vmovdqa 32(%rdi), %ymm1
; CHECK-SKX-VBMI-NEXT: vmovdqa (%rsi), %ymm2
; CHECK-SKX-VBMI-NEXT: vmovdqa 32(%rsi), %ymm3
-; CHECK-SKX-VBMI-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; CHECK-SKX-VBMI-NEXT: vpbroadcastd {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; CHECK-SKX-VBMI-NEXT: vpandn %ymm3, %ymm4, %ymm5
; CHECK-SKX-VBMI-NEXT: vpmaddubsw %ymm5, %ymm1, %ymm5
; CHECK-SKX-VBMI-NEXT: vpand %ymm4, %ymm3, %ymm3
@@ -912,7 +912,7 @@ define dso_local void @mul256(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"="
; CHECK-AVX512-NEXT: vmovdqa 32(%rdi), %ymm1
; CHECK-AVX512-NEXT: vmovdqa (%rsi), %ymm2
; CHECK-AVX512-NEXT: vmovdqa 32(%rsi), %ymm3
-; CHECK-AVX512-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; CHECK-AVX512-NEXT: vpbroadcastd {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; CHECK-AVX512-NEXT: vpand %ymm4, %ymm3, %ymm5
; CHECK-AVX512-NEXT: vpmaddubsw %ymm5, %ymm1, %ymm5
; CHECK-AVX512-NEXT: vpandn %ymm3, %ymm4, %ymm3
@@ -936,7 +936,7 @@ define dso_local void @mul256(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"="
; CHECK-VBMI-NEXT: vmovdqa 32(%rdi), %ymm1
; CHECK-VBMI-NEXT: vmovdqa (%rsi), %ymm2
; CHECK-VBMI-NEXT: vmovdqa 32(%rsi), %ymm3
-; CHECK-VBMI-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; CHECK-VBMI-NEXT: vpbroadcastd {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; CHECK-VBMI-NEXT: vpandn %ymm3, %ymm4, %ymm5
; CHECK-VBMI-NEXT: vpmaddubsw %ymm5, %ymm1, %ymm5
; CHECK-VBMI-NEXT: vpand %ymm4, %ymm3, %ymm3
@@ -964,7 +964,7 @@ define dso_local void @mul512(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"="
; CHECK-SKX-VBMI: # %bb.0:
; CHECK-SKX-VBMI-NEXT: vmovdqa64 (%rdi), %zmm0
; CHECK-SKX-VBMI-NEXT: vmovdqa64 (%rsi), %zmm1
-; CHECK-SKX-VBMI-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; CHECK-SKX-VBMI-NEXT: vpbroadcastd {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; CHECK-SKX-VBMI-NEXT: vpandnq %zmm1, %zmm2, %zmm3
; CHECK-SKX-VBMI-NEXT: vpmaddubsw %zmm3, %zmm0, %zmm3
; CHECK-SKX-VBMI-NEXT: vpandq %zmm2, %zmm1, %zmm1
@@ -979,7 +979,7 @@ define dso_local void @mul512(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"="
; CHECK-AVX512: # %bb.0:
; CHECK-AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
; CHECK-AVX512-NEXT: vmovdqa64 (%rsi), %zmm1
-; CHECK-AVX512-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; CHECK-AVX512-NEXT: vpbroadcastd {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; CHECK-AVX512-NEXT: vpandq %zmm2, %zmm1, %zmm3
; CHECK-AVX512-NEXT: vpmaddubsw %zmm3, %zmm0, %zmm3
; CHECK-AVX512-NEXT: vpandnq %zmm1, %zmm2, %zmm1
@@ -994,7 +994,7 @@ define dso_local void @mul512(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"="
; CHECK-VBMI: # %bb.0:
; CHECK-VBMI-NEXT: vmovdqa64 (%rdi), %zmm0
; CHECK-VBMI-NEXT: vmovdqa64 (%rsi), %zmm1
-; CHECK-VBMI-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; CHECK-VBMI-NEXT: vpbroadcastd {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; CHECK-VBMI-NEXT: vpandnq %zmm1, %zmm2, %zmm3
; CHECK-VBMI-NEXT: vpmaddubsw %zmm3, %zmm0, %zmm3
; CHECK-VBMI-NEXT: vpandq %zmm2, %zmm1, %zmm1
@@ -1127,7 +1127,7 @@ define <16 x i16> @trunc_v16i32_v16i16_zeroes(ptr %x) nounwind "min-legal-vector
; CHECK-LABEL: trunc_v16i32_v16i16_zeroes:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %ymm1
-; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
; CHECK-NEXT: vpermi2w 32(%rdi), %ymm1, %ymm0
; CHECK-NEXT: retq
%a = load <16 x i32>, ptr %x
@@ -1182,7 +1182,7 @@ define <16 x i16> @trunc_v16i32_v16i16_sign(ptr %x) nounwind "min-legal-vector-w
; CHECK-LABEL: trunc_v16i32_v16i16_sign:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %ymm1
-; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
; CHECK-NEXT: vpermi2w 32(%rdi), %ymm1, %ymm0
; CHECK-NEXT: retq
%a = load <16 x i32>, ptr %x
diff --git a/llvm/test/CodeGen/X86/pr30290.ll b/llvm/test/CodeGen/X86/pr30290.ll
index 478cb142475da..74e553191331f 100644
--- a/llvm/test/CodeGen/X86/pr30290.ll
+++ b/llvm/test/CodeGen/X86/pr30290.ll
@@ -20,7 +20,7 @@ define void @foo(ptr byval(%struct.face) nocapture align 8) local_unnamed_addr {
; CHECK: # %bb.0:
; CHECK-NEXT: subq $40, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 48
-; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1]
+; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,1,1]
; CHECK-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movl $1, {{[0-9]+}}(%rsp)
; CHECK-NEXT: vmovups {{[0-9]+}}(%rsp), %xmm0
diff --git a/llvm/test/CodeGen/X86/pr46532.ll b/llvm/test/CodeGen/X86/pr46532.ll
index c798e74a0b231..cbc677229ede6 100644
--- a/llvm/test/CodeGen/X86/pr46532.ll
+++ b/llvm/test/CodeGen/X86/pr46532.ll
@@ -7,7 +7,7 @@ define void @WhileWithLoopInvariantOperation.21() {
; CHECK-NEXT: movq (%rax), %rax
; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vmovaps %xmm0, 32(%rax)
-; CHECK-NEXT: vpmovsxbq {{.*#+}} xmm0 = [18446744073709551615,0]
+; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = [4294967295,4294967295,0,0]
; CHECK-NEXT: vmaskmovps %ymm0, %ymm0, (%rax)
while.1.body.preheader:
%0 = load ptr, ptr undef, align 8, !invariant.load !0, !dereferenceable !1, !align !2
diff --git a/llvm/test/CodeGen/X86/pr57340.ll b/llvm/test/CodeGen/X86/pr57340.ll
index 6bebbe3cee1f9..9ac3ee4de2a34 100644
--- a/llvm/test/CodeGen/X86/pr57340.ll
+++ b/llvm/test/CodeGen/X86/pr57340.ll
@@ -6,7 +6,7 @@ define void @main.41() local_unnamed_addr #1 {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpbroadcastw (%rax), %ymm0
; CHECK-NEXT: vmovdqu (%rax), %ymm1
-; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm2 = [31,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [31,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
; CHECK-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
; CHECK-NEXT: vcvtph2ps %ymm2, %zmm0
; CHECK-NEXT: vcvtph2ps %ymm1, %zmm1
diff --git a/llvm/test/CodeGen/X86/pr63108.ll b/llvm/test/CodeGen/X86/pr63108.ll
index 4bbc1707e10c3..b5b80515fc6d9 100644
--- a/llvm/test/CodeGen/X86/pr63108.ll
+++ b/llvm/test/CodeGen/X86/pr63108.ll
@@ -117,7 +117,7 @@ define i32 @PR63108() {
; AVX512-NEXT: testb %al, %al
; AVX512-NEXT: je .LBB0_2
; AVX512-NEXT: # %bb.1:
-; AVX512-NEXT: vpbroadcastw {{.*#+}} xmm0 = [251,223,251,223,251,223,251,223,251,223,251,223,251,223,251,223]
+; AVX512-NEXT: vmovd {{.*#+}} xmm0 = [57339,0,0,0]
; AVX512-NEXT: jmp .LBB0_5
; AVX512-NEXT: .LBB0_2: # %vector.body.preheader
; AVX512-NEXT: vmovd {{.*#+}} xmm0 = [57339,0,0,0]
diff --git a/llvm/test/CodeGen/X86/pr77459.ll b/llvm/test/CodeGen/X86/pr77459.ll
index 96f6a18819383..9c072e6f5e3fc 100644
--- a/llvm/test/CodeGen/X86/pr77459.ll
+++ b/llvm/test/CodeGen/X86/pr77459.ll
@@ -98,7 +98,7 @@ define i8 @reverse_cmp_v8i1(<8 x i16> %a0, <8 x i16> %a1) {
; AVX512: # %bb.0:
; AVX512-NEXT: vpcmpeqw %xmm1, %xmm0, %k0
; AVX512-NEXT: vpmovm2d %k0, %ymm0
-; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0]
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0]
; AVX512-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX512-NEXT: vpmovd2m %ymm0, %k0
; AVX512-NEXT: kmovd %k0, %eax
@@ -157,7 +157,7 @@ define i16 @reverse_cmp_v16i1(<16 x i8> %a0, <16 x i8> %a1) {
; AVX512: # %bb.0:
; AVX512-NEXT: vpcmpeqb %xmm1, %xmm0, %k0
; AVX512-NEXT: vpmovm2w %k0, %ymm0
-; AVX512-NEXT: vpmovsxbw {{.*#+}} ymm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; AVX512-NEXT: vpermw %ymm0, %ymm1, %ymm0
; AVX512-NEXT: vpmovw2m %ymm0, %k0
; AVX512-NEXT: kmovd %k0, %eax
diff --git a/llvm/test/CodeGen/X86/recip-fastmath.ll b/llvm/test/CodeGen/X86/recip-fastmath.ll
index bb8c074f97e5e..dab7a6aab2d82 100644
--- a/llvm/test/CodeGen/X86/recip-fastmath.ll
+++ b/llvm/test/CodeGen/X86/recip-fastmath.ll
@@ -333,11 +333,53 @@ define <4 x float> @v4f32_no_estimate(<4 x float> %x) #0 {
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: v4f32_no_estimate:
-; AVX: # %bb.0:
-; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; AVX-NEXT: vdivps %xmm0, %xmm1, %xmm0
-; AVX-NEXT: retq
+; AVX-RECIP-LABEL: v4f32_no_estimate:
+; AVX-RECIP: # %bb.0:
+; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX-RECIP-NEXT: vdivps %xmm0, %xmm1, %xmm0
+; AVX-RECIP-NEXT: retq
+;
+; FMA-RECIP-LABEL: v4f32_no_estimate:
+; FMA-RECIP: # %bb.0:
+; FMA-RECIP-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; FMA-RECIP-NEXT: vdivps %xmm0, %xmm1, %xmm0
+; FMA-RECIP-NEXT: retq
+;
+; BDVER2-LABEL: v4f32_no_estimate:
+; BDVER2: # %bb.0:
+; BDVER2-NEXT: vmovaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BDVER2-NEXT: vdivps %xmm0, %xmm1, %xmm0
+; BDVER2-NEXT: retq
+;
+; BTVER2-LABEL: v4f32_no_estimate:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: vmovaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BTVER2-NEXT: vdivps %xmm0, %xmm1, %xmm0
+; BTVER2-NEXT: retq
+;
+; SANDY-LABEL: v4f32_no_estimate:
+; SANDY: # %bb.0:
+; SANDY-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SANDY-NEXT: vdivps %xmm0, %xmm1, %xmm0
+; SANDY-NEXT: retq
+;
+; HASWELL-LABEL: v4f32_no_estimate:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; HASWELL-NEXT: vdivps %xmm0, %xmm1, %xmm0
+; HASWELL-NEXT: retq
+;
+; HASWELL-NO-FMA-LABEL: v4f32_no_estimate:
+; HASWELL-NO-FMA: # %bb.0:
+; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; HASWELL-NO-FMA-NEXT: vdivps %xmm0, %xmm1, %xmm0
+; HASWELL-NO-FMA-NEXT: retq
+;
+; AVX512-LABEL: v4f32_no_estimate:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX512-NEXT: vdivps %xmm0, %xmm1, %xmm0
+; AVX512-NEXT: retq
%div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
ret <4 x float> %div
}
@@ -380,7 +422,7 @@ define <4 x float> @v4f32_one_step(<4 x float> %x) #1 {
;
; BTVER2-LABEL: v4f32_one_step:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BTVER2-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT: vrcpps %xmm0, %xmm1
; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0
; BTVER2-NEXT: vsubps %xmm0, %xmm2, %xmm0
@@ -567,7 +609,7 @@ define <4 x float> @v4f32_two_step(<4 x float> %x) #2 {
; BDVER2-LABEL: v4f32_two_step:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vrcpps %xmm0, %xmm1
-; BDVER2-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BDVER2-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BDVER2-NEXT: vfmsubps {{.*#+}} xmm3 = (xmm0 * xmm1) - xmm2
; BDVER2-NEXT: vfnmaddps {{.*#+}} xmm1 = -(xmm1 * xmm3) + xmm1
; BDVER2-NEXT: vfmsubps {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm2
@@ -576,7 +618,7 @@ define <4 x float> @v4f32_two_step(<4 x float> %x) #2 {
;
; BTVER2-LABEL: v4f32_two_step:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BTVER2-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT: vrcpps %xmm0, %xmm1
; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm2
; BTVER2-NEXT: vsubps %xmm2, %xmm3, %xmm2
@@ -652,11 +694,53 @@ define <8 x float> @v8f32_no_estimate(<8 x float> %x) #0 {
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: retq
;
-; AVX-LABEL: v8f32_no_estimate:
-; AVX: # %bb.0:
-; AVX-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; AVX-NEXT: vdivps %ymm0, %ymm1, %ymm0
-; AVX-NEXT: retq
+; AVX-RECIP-LABEL: v8f32_no_estimate:
+; AVX-RECIP: # %bb.0:
+; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX-RECIP-NEXT: vdivps %ymm0, %ymm1, %ymm0
+; AVX-RECIP-NEXT: retq
+;
+; FMA-RECIP-LABEL: v8f32_no_estimate:
+; FMA-RECIP: # %bb.0:
+; FMA-RECIP-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; FMA-RECIP-NEXT: vdivps %ymm0, %ymm1, %ymm0
+; FMA-RECIP-NEXT: retq
+;
+; BDVER2-LABEL: v8f32_no_estimate:
+; BDVER2: # %bb.0:
+; BDVER2-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BDVER2-NEXT: vdivps %ymm0, %ymm1, %ymm0
+; BDVER2-NEXT: retq
+;
+; BTVER2-LABEL: v8f32_no_estimate:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BTVER2-NEXT: vdivps %ymm0, %ymm1, %ymm0
+; BTVER2-NEXT: retq
+;
+; SANDY-LABEL: v8f32_no_estimate:
+; SANDY: # %bb.0:
+; SANDY-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SANDY-NEXT: vdivps %ymm0, %ymm1, %ymm0
+; SANDY-NEXT: retq
+;
+; HASWELL-LABEL: v8f32_no_estimate:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; HASWELL-NEXT: vdivps %ymm0, %ymm1, %ymm0
+; HASWELL-NEXT: retq
+;
+; HASWELL-NO-FMA-LABEL: v8f32_no_estimate:
+; HASWELL-NO-FMA: # %bb.0:
+; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; HASWELL-NO-FMA-NEXT: vdivps %ymm0, %ymm1, %ymm0
+; HASWELL-NO-FMA-NEXT: retq
+;
+; AVX512-LABEL: v8f32_no_estimate:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX512-NEXT: vdivps %ymm0, %ymm1, %ymm0
+; AVX512-NEXT: retq
%div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
ret <8 x float> %div
}
@@ -718,7 +802,7 @@ define <8 x float> @v8f32_one_step(<8 x float> %x) #1 {
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpps %ymm0, %ymm1
; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0
-; SANDY-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0
; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0
; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0
@@ -819,7 +903,7 @@ define <8 x float> @v8f32_two_step(<8 x float> %x) #2 {
; BDVER2-LABEL: v8f32_two_step:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vrcpps %ymm0, %ymm1
-; BDVER2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BDVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BDVER2-NEXT: vfmsubps {{.*#+}} ymm3 = (ymm0 * ymm1) - ymm2
; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm1 = -(ymm1 * ymm3) + ymm1
; BDVER2-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm1) - ymm2
@@ -844,7 +928,7 @@ define <8 x float> @v8f32_two_step(<8 x float> %x) #2 {
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpps %ymm0, %ymm1
; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm2
-; SANDY-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SANDY-NEXT: vsubps %ymm2, %ymm3, %ymm2
; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm2
; SANDY-NEXT: vaddps %ymm2, %ymm1, %ymm1
@@ -926,7 +1010,7 @@ define <16 x float> @v16f32_no_estimate(<16 x float> %x) #0 {
;
; BDVER2-LABEL: v16f32_no_estimate:
; BDVER2: # %bb.0:
-; BDVER2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BDVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BDVER2-NEXT: vdivps %ymm0, %ymm2, %ymm0
; BDVER2-NEXT: vdivps %ymm1, %ymm2, %ymm1
; BDVER2-NEXT: retq
@@ -940,7 +1024,7 @@ define <16 x float> @v16f32_no_estimate(<16 x float> %x) #0 {
;
; SANDY-LABEL: v16f32_no_estimate:
; SANDY: # %bb.0:
-; SANDY-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SANDY-NEXT: vdivps %ymm0, %ymm2, %ymm0
; SANDY-NEXT: vdivps %ymm1, %ymm2, %ymm1
; SANDY-NEXT: retq
@@ -1030,7 +1114,7 @@ define <16 x float> @v16f32_one_step(<16 x float> %x) #1 {
; BDVER2-LABEL: v16f32_one_step:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vrcpps %ymm0, %ymm2
-; BDVER2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BDVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BDVER2-NEXT: vrcpps %ymm1, %ymm4
; BDVER2-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm2) - ymm3
; BDVER2-NEXT: vfmsubps {{.*#+}} ymm1 = (ymm1 * ymm4) - ymm3
@@ -1057,7 +1141,7 @@ define <16 x float> @v16f32_one_step(<16 x float> %x) #1 {
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpps %ymm0, %ymm2
; SANDY-NEXT: vmulps %ymm2, %ymm0, %ymm0
-; SANDY-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SANDY-NEXT: vsubps %ymm0, %ymm3, %ymm0
; SANDY-NEXT: vrcpps %ymm1, %ymm4
; SANDY-NEXT: vmulps %ymm0, %ymm2, %ymm0
@@ -1204,7 +1288,7 @@ define <16 x float> @v16f32_two_step(<16 x float> %x) #2 {
; BDVER2-LABEL: v16f32_two_step:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vrcpps %ymm0, %ymm2
-; BDVER2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BDVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BDVER2-NEXT: vfmsubps {{.*#+}} ymm4 = (ymm0 * ymm2) - ymm3
; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm2 = -(ymm2 * ymm4) + ymm2
; BDVER2-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm2) - ymm3
@@ -1243,7 +1327,7 @@ define <16 x float> @v16f32_two_step(<16 x float> %x) #2 {
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpps %ymm0, %ymm2
; SANDY-NEXT: vmulps %ymm2, %ymm0, %ymm3
-; SANDY-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SANDY-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SANDY-NEXT: vsubps %ymm3, %ymm4, %ymm3
; SANDY-NEXT: vmulps %ymm3, %ymm2, %ymm3
; SANDY-NEXT: vaddps %ymm3, %ymm2, %ymm2
diff --git a/llvm/test/CodeGen/X86/recip-fastmath2.ll b/llvm/test/CodeGen/X86/recip-fastmath2.ll
index 042069703af21..77ccaff15e42a 100644
--- a/llvm/test/CodeGen/X86/recip-fastmath2.ll
+++ b/llvm/test/CodeGen/X86/recip-fastmath2.ll
@@ -504,7 +504,7 @@ define <4 x float> @v4f32_one_step_2_divs(<4 x float> %x) #1 {
;
; BTVER2-LABEL: v4f32_one_step_2_divs:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BTVER2-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT: vrcpps %xmm0, %xmm1
; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0
; BTVER2-NEXT: vsubps %xmm0, %xmm2, %xmm0
@@ -632,7 +632,7 @@ define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 {
;
; BTVER2-LABEL: v4f32_two_step2:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BTVER2-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT: vrcpps %xmm0, %xmm1
; BTVER2-NEXT: vmovaps {{.*#+}} xmm4 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm2
@@ -880,7 +880,7 @@ define <8 x float> @v8f32_one_step_2_divs(<8 x float> %x) #1 {
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpps %ymm0, %ymm1
; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0
-; SANDY-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0
; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0
; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0
@@ -1027,7 +1027,7 @@ define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 {
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpps %ymm0, %ymm1
; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm2
-; SANDY-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SANDY-NEXT: vsubps %ymm2, %ymm3, %ymm2
; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm2
; SANDY-NEXT: vaddps %ymm2, %ymm1, %ymm1
@@ -1360,7 +1360,7 @@ define <16 x float> @v16f32_one_step_2_divs(<16 x float> %x) #1 {
; BDVER2-LABEL: v16f32_one_step_2_divs:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vrcpps %ymm0, %ymm2
-; BDVER2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BDVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BDVER2-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm2) - ymm3
; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm2
; BDVER2-NEXT: vrcpps %ymm1, %ymm2
@@ -1395,7 +1395,7 @@ define <16 x float> @v16f32_one_step_2_divs(<16 x float> %x) #1 {
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpps %ymm0, %ymm2
; SANDY-NEXT: vmulps %ymm2, %ymm0, %ymm0
-; SANDY-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SANDY-NEXT: vsubps %ymm0, %ymm3, %ymm0
; SANDY-NEXT: vmulps %ymm0, %ymm2, %ymm0
; SANDY-NEXT: vaddps %ymm0, %ymm2, %ymm0
@@ -1572,7 +1572,7 @@ define <16 x float> @v16f32_two_step2(<16 x float> %x) #2 {
; BDVER2-LABEL: v16f32_two_step2:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vrcpps %ymm0, %ymm2
-; BDVER2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BDVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BDVER2-NEXT: vfmsubps {{.*#+}} ymm4 = (ymm0 * ymm2) - ymm3
; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm2 = -(ymm2 * ymm4) + ymm2
; BDVER2-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
@@ -1619,7 +1619,7 @@ define <16 x float> @v16f32_two_step2(<16 x float> %x) #2 {
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpps %ymm0, %ymm2
; SANDY-NEXT: vmulps %ymm2, %ymm0, %ymm3
-; SANDY-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SANDY-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SANDY-NEXT: vsubps %ymm3, %ymm4, %ymm3
; SANDY-NEXT: vmulps %ymm3, %ymm2, %ymm3
; SANDY-NEXT: vaddps %ymm3, %ymm2, %ymm2
diff --git a/llvm/test/CodeGen/X86/slow-pmulld.ll b/llvm/test/CodeGen/X86/slow-pmulld.ll
index 59aa96520070e..975ffd06db03b 100644
--- a/llvm/test/CodeGen/X86/slow-pmulld.ll
+++ b/llvm/test/CodeGen/X86/slow-pmulld.ll
@@ -101,7 +101,7 @@ define <4 x i32> @test_mul_v4i32_v4i8(<4 x i8> %A) {
define <8 x i32> @test_mul_v8i32_v8i8(<8 x i8> %A) {
; SLM-LABEL: test_mul_v8i32_v8i8:
; SLM: # %bb.0:
-; SLM-NEXT: pmovsxwd {{.*#+}} xmm2 = [18778,18778,18778,18778]
+; SLM-NEXT: movdqa {{.*#+}} xmm2 = [18778,0,18778,0,18778,0,18778,0]
; SLM-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SLM-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SLM-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
@@ -199,7 +199,7 @@ define <16 x i32> @test_mul_v16i32_v16i8(<16 x i8> %A) {
; SLM-LABEL: test_mul_v16i32_v16i8:
; SLM: # %bb.0:
; SLM-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; SLM-NEXT: pmovsxwd {{.*#+}} xmm5 = [18778,18778,18778,18778]
+; SLM-NEXT: movdqa {{.*#+}} xmm5 = [18778,0,18778,0,18778,0,18778,0]
; SLM-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
; SLM-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SLM-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
diff --git a/llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll b/llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll
index 2b78a70ebcc26..74b51ac21dc1f 100644
--- a/llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll
+++ b/llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll
@@ -153,7 +153,7 @@ define <8 x float> @v8f32_no_daz(<8 x float> %f) #0 {
; SNB-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; SNB-NEXT: vmulps %ymm1, %ymm3, %ymm1
; SNB-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; SNB-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
+; SNB-NEXT: vmovaps {{.*#+}} ymm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
; SNB-NEXT: vcmpleps %ymm0, %ymm2, %ymm0
; SNB-NEXT: vandps %ymm1, %ymm0, %ymm0
; SNB-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/sqrt-fastmath.ll b/llvm/test/CodeGen/X86/sqrt-fastmath.ll
index 384f8b832afb9..9f420bcede110 100644
--- a/llvm/test/CodeGen/X86/sqrt-fastmath.ll
+++ b/llvm/test/CodeGen/X86/sqrt-fastmath.ll
@@ -454,12 +454,19 @@ define <8 x float> @v8f32_no_estimate(<8 x float> %x) #0 {
; SSE-NEXT: divps %xmm2, %xmm1
; SSE-NEXT: retq
;
-; AVX-LABEL: v8f32_no_estimate:
-; AVX: # %bb.0:
-; AVX-NEXT: vsqrtps %ymm0, %ymm0
-; AVX-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; AVX-NEXT: vdivps %ymm0, %ymm1, %ymm0
-; AVX-NEXT: retq
+; AVX1-LABEL: v8f32_no_estimate:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vsqrtps %ymm0, %ymm0
+; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX1-NEXT: vdivps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX512-LABEL: v8f32_no_estimate:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vsqrtps %ymm0, %ymm0
+; AVX512-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX512-NEXT: vdivps %ymm0, %ymm1, %ymm0
+; AVX512-NEXT: retq
%sqrt = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %x)
%div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
ret <8 x float> %div
@@ -530,7 +537,7 @@ define <16 x float> @v16f32_no_estimate(<16 x float> %x) #0 {
; AVX1: # %bb.0:
; AVX1-NEXT: vsqrtps %ymm1, %ymm1
; AVX1-NEXT: vsqrtps %ymm0, %ymm0
-; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX1-NEXT: vdivps %ymm0, %ymm2, %ymm0
; AVX1-NEXT: vdivps %ymm1, %ymm2, %ymm1
; AVX1-NEXT: retq
@@ -580,11 +587,11 @@ define <16 x float> @v16f32_estimate(<16 x float> %x) #1 {
; AVX1-LABEL: v16f32_estimate:
; AVX1: # %bb.0:
; AVX1-NEXT: vrsqrtps %ymm0, %ymm2
-; AVX1-NEXT: vbroadcastss {{.*#+}} ymm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; AVX1-NEXT: vmulps %ymm3, %ymm2, %ymm4
; AVX1-NEXT: vmulps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vmulps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
+; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; AVX1-NEXT: vaddps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vrsqrtps %ymm1, %ymm5
; AVX1-NEXT: vmulps %ymm0, %ymm4, %ymm0
diff --git a/llvm/test/CodeGen/X86/sse-domains.ll b/llvm/test/CodeGen/X86/sse-domains.ll
index 249425ec4dc9e..8a1b811b77e5d 100644
--- a/llvm/test/CodeGen/X86/sse-domains.ll
+++ b/llvm/test/CodeGen/X86/sse-domains.ll
@@ -17,7 +17,7 @@ define void @f(ptr nocapture %p, i32 %n) nounwind uwtable ssp {
; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: addq $16, %rdi
; CHECK-NEXT: pxor %xmm1, %xmm1
-; CHECK-NEXT: pmovsxbd {{.*#+}} xmm0 = [127,127,127,127]
+; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [127,127,127,127]
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: LBB0_1: ## %while.body
; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1
diff --git a/llvm/test/CodeGen/X86/trunc-vector-width.ll b/llvm/test/CodeGen/X86/trunc-vector-width.ll
index 42cc624b5a535..6cfe5b2841b8a 100644
--- a/llvm/test/CodeGen/X86/trunc-vector-width.ll
+++ b/llvm/test/CodeGen/X86/trunc-vector-width.ll
@@ -5,7 +5,7 @@ define void @test(ptr %a0) #0 {
; CHECK-LABEL: test:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqu (%rdi), %ymm0
-; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,4,0,0]
+; CHECK-NEXT: vmovq {{.*#+}} xmm1 = [0,4,0,0]
; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = mem[0],ymm0[1,2,3,4,5,6,7]
; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; CHECK-NEXT: vpermd %ymm0, %ymm1, %ymm0
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll
index 7df80ee9f175b..995607a6857bd 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll
@@ -220,7 +220,7 @@ define <16 x float> @shuffle_f32_v16f32_00_08_00_08_00_08_00_08_00_08_00_08_00_0
; FAST: # %bb.0:
; FAST-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; FAST-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; FAST-NEXT: vbroadcastsd {{.*#+}} ymm2 = [0,16,0,16,0,16,0,16]
+; FAST-NEXT: vmovaps {{.*#+}} ymm2 = [0,16,0,16,0,16,0,16]
; FAST-NEXT: vpermt2ps %zmm1, %zmm2, %zmm0
; FAST-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; FAST-NEXT: retq
@@ -277,7 +277,7 @@ define <16 x i32> @shuffle_v16i32_02_zz_03_zz_06_zz_07_zz_0a_zz_0b_zz_0e_zz_0f_z
define <16 x i32> @shuffle_v16i32_01_02_03_16_05_06_07_20_09_10_11_24_13_14_15_28(<16 x i32> %a, <16 x i32> %b) {
; AVX512F-LABEL: shuffle_v16i32_01_02_03_16_05_06_07_20_09_10_11_24_13_14_15_28:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm2 = [1,2,3,16,5,6,7,20,9,10,11,24,13,14,15,28]
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,2,3,16,5,6,7,20,9,10,11,24,13,14,15,28]
; AVX512F-NEXT: vpermt2d %zmm1, %zmm2, %zmm0
; AVX512F-NEXT: retq
;
@@ -292,7 +292,7 @@ define <16 x i32> @shuffle_v16i32_01_02_03_16_05_06_07_20_09_10_11_24_13_14_15_2
define <16 x float> @shuffle_v16f32_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01(<16 x float> %a) {
; ALL-LABEL: shuffle_v16f32_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01:
; ALL: # %bb.0:
-; ALL-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,5,0,0,7,0,10,1,0,5,0,4,7,0,10,1]
+; ALL-NEXT: vmovaps {{.*#+}} zmm1 = [2,5,u,u,7,u,10,1,0,5,u,4,7,u,10,1]
; ALL-NEXT: vpermps %zmm0, %zmm1, %zmm0
; ALL-NEXT: retq
%c = shufflevector <16 x float> %a, <16 x float> poison, <16 x i32> <i32 2, i32 5, i32 poison, i32 poison, i32 7, i32 poison, i32 10, i32 1, i32 0, i32 5, i32 poison, i32 4, i32 7, i32 poison, i32 10, i32 1>
@@ -302,7 +302,7 @@ define <16 x float> @shuffle_v16f32_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01(<
define <16 x i32> @shuffle_v16i32_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01(<16 x i32> %a) {
; ALL-LABEL: shuffle_v16i32_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01:
; ALL: # %bb.0:
-; ALL-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,5,0,0,7,0,10,1,0,5,0,4,7,0,10,1]
+; ALL-NEXT: vmovaps {{.*#+}} zmm1 = [2,5,u,u,7,u,10,1,0,5,u,4,7,u,10,1]
; ALL-NEXT: vpermps %zmm0, %zmm1, %zmm0
; ALL-NEXT: retq
%c = shufflevector <16 x i32> %a, <16 x i32> poison, <16 x i32> <i32 2, i32 5, i32 poison, i32 poison, i32 7, i32 poison, i32 10, i32 1, i32 0, i32 5, i32 poison, i32 4, i32 7, i32 poison, i32 10, i32 1>
@@ -312,7 +312,7 @@ define <16 x i32> @shuffle_v16i32_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01(<16
define <16 x i32> @shuffle_v16i32_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18(<16 x i32> %a, <16 x i32> %b) {
; ALL-LABEL: shuffle_v16i32_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18:
; ALL: # %bb.0:
-; ALL-NEXT: vpmovsxbd {{.*#+}} zmm2 = [15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,24]
+; ALL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,24]
; ALL-NEXT: vpermt2d %zmm1, %zmm2, %zmm0
; ALL-NEXT: retq
%c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24>
@@ -322,7 +322,7 @@ define <16 x i32> @shuffle_v16i32_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_1
define <16 x float> @shuffle_v16f32_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18(<16 x float> %a, <16 x float> %b) {
; ALL-LABEL: shuffle_v16f32_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18:
; ALL: # %bb.0:
-; ALL-NEXT: vpmovsxbd {{.*#+}} zmm2 = [15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,24]
+; ALL-NEXT: vmovaps {{.*#+}} zmm2 = [15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,24]
; ALL-NEXT: vpermt2ps %zmm1, %zmm2, %zmm0
; ALL-NEXT: retq
%c = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24>
@@ -339,7 +339,7 @@ define <16 x i32> @shuffle_v16i32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_0
;
; FAST-LABEL: shuffle_v16i32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04:
; FAST: # %bb.0:
-; FAST-NEXT: vpmovsxbd {{.*#+}} zmm1 = [11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4]
+; FAST-NEXT: vmovaps {{.*#+}} zmm1 = [11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4]
; FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0
; FAST-NEXT: retq
%1 = shufflevector <16 x i32> %a, <16 x i32> poison, <16 x i32> <i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
@@ -355,7 +355,7 @@ define <16 x float> @shuffle_v16f32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05
;
; FAST-LABEL: shuffle_v16f32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04:
; FAST: # %bb.0:
-; FAST-NEXT: vpmovsxbd {{.*#+}} zmm1 = [11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4]
+; FAST-NEXT: vmovaps {{.*#+}} zmm1 = [11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4]
; FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0
; FAST-NEXT: retq
%1 = shufflevector <16 x float> %a, <16 x float> poison, <16 x i32> <i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
@@ -365,7 +365,7 @@ define <16 x float> @shuffle_v16f32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05
define <16 x float> @shuffle_v16f32_load_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18(<16 x float> %a, ptr %b) {
; ALL-LABEL: shuffle_v16f32_load_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18:
; ALL: # %bb.0:
-; ALL-NEXT: vpmovsxbd {{.*#+}} zmm1 = [15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,24]
+; ALL-NEXT: vmovaps {{.*#+}} zmm1 = [15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,24]
; ALL-NEXT: vpermt2ps (%rdi), %zmm1, %zmm0
; ALL-NEXT: retq
%c = load <16 x float>, ptr %b
@@ -382,7 +382,7 @@ define <16 x float> @shuffle_v16f32_load_08_11_10_00_12_15_14_04(<16 x float> %a
;
; FAST-LABEL: shuffle_v16f32_load_08_11_10_00_12_15_14_04:
; FAST: # %bb.0:
-; FAST-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,3,2,16,4,7,6,20,8,11,10,24,12,15,14,28]
+; FAST-NEXT: vmovaps {{.*#+}} zmm1 = [0,3,2,16,4,7,6,20,8,11,10,24,12,15,14,28]
; FAST-NEXT: vpermt2ps (%rdi), %zmm1, %zmm0
; FAST-NEXT: retq
%1 = load <16 x float>, ptr %a1
@@ -393,7 +393,7 @@ define <16 x float> @shuffle_v16f32_load_08_11_10_00_12_15_14_04(<16 x float> %a
define <16 x i32> @shuffle_v16i32_load_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18(<16 x i32> %a, ptr %b) {
; ALL-LABEL: shuffle_v16i32_load_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18:
; ALL: # %bb.0:
-; ALL-NEXT: vpmovsxbd {{.*#+}} zmm1 = [15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,24]
+; ALL-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,24]
; ALL-NEXT: vpermt2d (%rdi), %zmm1, %zmm0
; ALL-NEXT: retq
%c = load <16 x i32>, ptr %b
@@ -421,7 +421,7 @@ define <8 x i32> @test_v16i32_1_3_5_7_9_11_13_15(<16 x i32> %v) {
;
; FAST-LABEL: test_v16i32_1_3_5_7_9_11_13_15:
; FAST: # %bb.0:
-; FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,3,5,7,9,11,13,15]
+; FAST-NEXT: vmovaps {{.*#+}} ymm1 = [1,3,5,7,9,11,13,15]
; FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0
; FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; FAST-NEXT: retq
@@ -441,7 +441,7 @@ define <4 x i32> @test_v16i32_0_1_2_12 (<16 x i32> %v) {
;
; FAST-LABEL: test_v16i32_0_1_2_12:
; FAST: # %bb.0:
-; FAST-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,1,2,12]
+; FAST-NEXT: vmovaps {{.*#+}} xmm1 = [0,1,2,12]
; FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0
; FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; FAST-NEXT: vzeroupper
@@ -455,7 +455,7 @@ define <4 x i32> @test_v16i32_0_1_2_12 (<16 x i32> %v) {
define <4 x i32> @test_v16i32_0_4_8_12(<16 x i32> %v) {
; ALL-LABEL: test_v16i32_0_4_8_12:
; ALL: # %bb.0:
-; ALL-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,4,8,12]
+; ALL-NEXT: vmovaps {{.*#+}} xmm1 = [0,4,8,12]
; ALL-NEXT: vpermps %zmm0, %zmm1, %zmm0
; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; ALL-NEXT: vzeroupper
@@ -478,7 +478,7 @@ define <8 x float> @shuffle_v16f32_extract_256(ptr %RET, ptr %a) {
define <8 x float> @test_v16f32_0_1_2_3_4_6_7_10 (<16 x float> %v) {
; ALL-LABEL: test_v16f32_0_1_2_3_4_6_7_10:
; ALL: # %bb.0:
-; ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,1,2,3,4,6,7,10]
+; ALL-NEXT: vmovaps {{.*#+}} ymm1 = [0,1,2,3,4,6,7,10]
; ALL-NEXT: vpermps %zmm0, %zmm1, %zmm0
; ALL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; ALL-NEXT: retq
@@ -490,7 +490,7 @@ define <8 x float> @test_v16f32_0_1_2_3_4_6_7_10 (<16 x float> %v) {
define <4 x float> @test_v16f32_0_1_3_6 (<16 x float> %v) {
; ALL-LABEL: test_v16f32_0_1_3_6:
; ALL: # %bb.0:
-; ALL-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,1,3,6]
+; ALL-NEXT: vmovaps {{.*#+}} xmm1 = [0,1,3,6]
; ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; ALL-NEXT: vzeroupper
@@ -607,7 +607,7 @@ define <16 x i32> @shuffle_v16i32_16_16_02_03_20_20_06_07_24_24_10_11_28_28_uu_u
define <16 x i32> @shuffle_v16i32_02_03_16_17_06_07_20_21_10_11_24_25_14_15_28_29(<16 x i32> %a, <16 x i32> %b) {
; AVX512F-LABEL: shuffle_v16i32_02_03_16_17_06_07_20_21_10_11_24_25_14_15_28_29:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm2 = [1,8,3,10,5,12,7,14]
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,8,3,10,5,12,7,14]
; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
; AVX512F-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll
index 3cf2d42799381..2d2bd38f4f727 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll
@@ -29,7 +29,7 @@ define <32 x i16> @shuffle_v32i16_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_0
;
; SKX-LABEL: shuffle_v32i16_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08:
; SKX: ## %bb.0:
-; SKX-NEXT: vpbroadcastw {{.*#+}} zmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; SKX-NEXT: vpbroadcastd {{.*#+}} zmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; SKX-NEXT: vpermw %zmm0, %zmm1, %zmm0
; SKX-NEXT: retq
%c = shufflevector <32 x i16> %a, <32 x i16> poison, <32 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
@@ -42,20 +42,20 @@ define <32 x i16> @shuffle_v32i16_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01_02_
; KNL-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[4,5,10,11,4,5,6,7,14,15,2,3,4,5,2,3,20,21,26,27,20,21,22,23,30,31,18,19,20,21,18,19]
; KNL-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
; KNL-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[0,1,10,11,8,9,8,9,14,15,6,7,4,5,14,15,16,17,26,27,24,25,24,25,30,31,22,23,20,21,30,31]
-; KNL-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,65535,0,0,65535,0,0,65535,0,0,0,0,0,0,65535,0]
+; KNL-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,u,u,u,u,255,255,u,u,0,0,255,255,0,0,0,0,u,u,0,0,0,0,u,u,255,255,u,u]
; KNL-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm3
; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; KNL-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm0[7],ymm3[8,9,10,11,12,13,14],ymm0[15]
; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
; KNL-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,10,11,8,9,8,9,14,15,2,3,4,5,2,3,16,17,26,27,24,25,24,25,30,31,18,19,20,21,18,19]
-; KNL-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,0,0,0,0,0,65535,0,65535,65535,0,65535,65535,0,0,65535]
+; KNL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,0,0,u,u,u,u,0,0,u,u,255,255,0,0,255,255,255,255,u,u,255,255,255,255,u,u,0,0,255,255]
; KNL-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: shuffle_v32i16_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_1f:
; SKX: ## %bb.0:
-; SKX-NEXT: vpmovsxbw {{.*#+}} zmm1 = [2,5,0,0,7,0,10,1,0,5,0,4,7,0,10,1,2,5,0,0,7,0,10,1,0,5,0,4,7,0,10,31]
+; SKX-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,5,u,u,7,u,10,1,0,5,u,4,7,u,10,1,2,5,u,u,7,u,10,1,0,5,u,4,7,u,10,31]
; SKX-NEXT: vpermw %zmm0, %zmm1, %zmm0
; SKX-NEXT: retq
%c = shufflevector <32 x i16> %a, <32 x i16> poison, <32 x i32> <i32 2, i32 5, i32 poison, i32 poison, i32 7, i32 poison, i32 10, i32 1, i32 0, i32 5, i32 poison, i32 4, i32 7, i32 poison, i32 10, i32 1, i32 2, i32 5, i32 poison, i32 poison, i32 7, i32 poison, i32 10, i32 1, i32 0, i32 5, i32 poison, i32 4, i32 7, i32 poison, i32 10, i32 31>
@@ -82,7 +82,7 @@ define <32 x i16> @shuffle_v32i16_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_1
;
; SKX-LABEL: shuffle_v32i16_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_38:
; SKX: ## %bb.0:
-; SKX-NEXT: vpmovsxbw {{.*#+}} zmm2 = [15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,24,15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,56]
+; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,24,15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,56]
; SKX-NEXT: vpermt2w %zmm1, %zmm2, %zmm0
; SKX-NEXT: retq
%c = shufflevector <32 x i16> %a, <32 x i16> %b, <32 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24, i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 56>
@@ -205,7 +205,7 @@ define <32 x i16> @shuffle_v32i16_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz(<32 x i16> %a
;
; SKX-LABEL: shuffle_v32i16_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz:
; SKX: ## %bb.0:
-; SKX-NEXT: vmovss {{.*#+}} xmm1 = [65535,0,0,0]
+; SKX-NEXT: vmovaps {{.*#+}} xmm1 = [65535,0,0,0]
; SKX-NEXT: vandps %zmm1, %zmm0, %zmm0
; SKX-NEXT: retq
%shuffle = shufflevector <32 x i16> %a, <32 x i16> zeroinitializer, <32 x i32> <i32 0, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
index ef8893dc4caf0..efa6c16fbf4eb 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
@@ -2698,13 +2698,13 @@ define <4 x i32> @combine_constant_insertion_v4i32(i32 %f) {
;
; SSE41-LABEL: combine_constant_insertion_v4i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [0,4,5,30]
+; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [u,4,5,30]
; SSE41-NEXT: pinsrd $0, %edi, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_constant_insertion_v4i32:
; AVX: # %bb.0:
-; AVX-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,4,5,30]
+; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [u,4,5,30]
; AVX-NEXT: vpinsrd $0, %edi, %xmm0, %xmm0
; AVX-NEXT: retq
%a0 = insertelement <4 x i32> undef, i32 %f, i32 0
@@ -3355,7 +3355,7 @@ define void @PR45604(ptr %dst, ptr %src) {
; SSE41-NEXT: movdqa (%rsi), %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; SSE41-NEXT: pmovsxbd {{.*#+}} xmm2 = [0,11,0,11]
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [u,0,11,0,u,0,11,0]
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; SSE41-NEXT: pmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
@@ -3379,7 +3379,7 @@ define void @PR45604(ptr %dst, ptr %src) {
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [720907,11,720907,11,720907,11,720907,11]
+; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [720907,11,720907,11,720907,11,720907,11]
; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
@@ -3395,7 +3395,7 @@ define void @PR45604(ptr %dst, ptr %src) {
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rsi), %xmm0
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,0,2]
-; AVX2-NEXT: vpmovsxdq {{.*#+}} ymm2 = [151519488,185205506,218891524,252577542]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,4,5,12,13,u,u,u,u,6,7,14,15,u,u,u,u]
; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [11,0,0,0,11,0,0,0,11,0,0,0,11,0,0,0,11,0,0,0,11,0,0,0,11,0,0,0,11,0,0,0]
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2],ymm3[3],ymm1[4],ymm3[5],ymm1[6],ymm3[7]
diff --git a/llvm/test/CodeGen/X86/vector-trunc-usat.ll b/llvm/test/CodeGen/X86/vector-trunc-usat.ll
index 65916aaf52f9e..a5d83a86f295e 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-usat.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-usat.ll
@@ -331,7 +331,7 @@ define <4 x i32> @trunc_usat_v4i64_v4i32(<4 x i64> %a0) {
; SKX-LABEL: trunc_usat_v4i64_v4i32:
; SKX: # %bb.0:
; SKX-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %k1
-; SKX-NEXT: vpmovzxdq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,429496729]
+; SKX-NEXT: vmovdqa {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,429496729]
; SKX-NEXT: vmovdqa64 %ymm0, %ymm1 {%k1}
; SKX-NEXT: vpmovqd %ymm1, %xmm0
; SKX-NEXT: vzeroupper