[llvm] [X86] X86FixupVectorConstantsPass - use scheduler model to avoid regressions (PR #140028)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Wed May 28 00:38:22 PDT 2025
https://github.com/RKSimon updated https://github.com/llvm/llvm-project/pull/140028
From 4a4a8bae92bd83a252982a1fdc09a91df66902d7 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Thu, 15 May 2025 10:00:04 +0100
Subject: [PATCH 1/4] [X86] X86FixupVectorConstantsPass - use scheduler model
to avoid regressions
When attempting to replace a full vector constant load with an instruction that uses a smaller constant, check the scheduler model to ensure the replacement instruction isn't slower than the original.
WIP - this currently ignores a couple of factors, including:
- what trade-off in throughput/latency/code size is acceptable in exchange for the smaller constant data size
- identifying hoisted constant loads, where a slower instruction might still be acceptable
Fixes #135998
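For reviewers skimming the patch: the heart of the heuristic is the three-valued CmpOptionals helper below, which only expresses a preference when both metrics are known and they differ, and otherwise returns nullopt so the caller falls through to the next metric (tput -> lat -> size). A minimal standalone sketch of that cascade (the metric values here are made up for illustration, not real scheduler model numbers):

#include <cassert>
#include <optional>

// Returns new < cur only when both values are known and differ;
// nullopt otherwise, so the caller falls through to the next metric.
template <typename T>
static std::optional<bool> CmpOptionals(T NewVal, T CurVal) {
  if (NewVal.has_value() && CurVal.has_value() && *NewVal != *CurVal)
    return *NewVal < *CurVal;
  return std::nullopt;
}

int main() {
  using OptD = std::optional<double>;
  assert(*CmpOptionals(OptD(0.5), OptD(1.0)));             // new opcode is faster
  assert(!CmpOptionals(OptD(1.0), OptD(1.0)).has_value()); // tie -> try next metric
  assert(!CmpOptionals(OptD(), OptD(1.0)).has_value());    // unknown -> try next metric
}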
---
.../Target/X86/X86FixupVectorConstants.cpp | 55 ++++++-
llvm/test/CodeGen/X86/avgceils.ll | 12 +-
llvm/test/CodeGen/X86/avgfloors.ll | 12 +-
llvm/test/CodeGen/X86/avx512-build-vector.ll | 2 +-
llvm/test/CodeGen/X86/combine-or-shuffle.ll | 2 +-
llvm/test/CodeGen/X86/combine-or.ll | 17 +--
.../test/CodeGen/X86/constant-pool-sharing.ll | 6 +-
.../X86/fold-int-pow2-with-fmul-or-fdiv.ll | 30 +++-
llvm/test/CodeGen/X86/fpclamptosat_vec.ll | 16 +-
llvm/test/CodeGen/X86/icmp-pow2-mask.ll | 40 ++---
llvm/test/CodeGen/X86/kmov.ll | 4 +-
.../CodeGen/X86/machine-combiner-int-vec.ll | 4 +-
.../CodeGen/X86/min-legal-vector-width.ll | 16 +-
llvm/test/CodeGen/X86/pr30290.ll | 2 +-
llvm/test/CodeGen/X86/pr38639.ll | 5 +-
llvm/test/CodeGen/X86/pr46532.ll | 2 +-
llvm/test/CodeGen/X86/pr57340.ll | 2 +-
llvm/test/CodeGen/X86/pr63108.ll | 2 +-
llvm/test/CodeGen/X86/pr77459.ll | 4 +-
llvm/test/CodeGen/X86/recip-fastmath.ll | 138 ++++++++++++++----
llvm/test/CodeGen/X86/recip-fastmath2.ll | 24 +--
llvm/test/CodeGen/X86/slow-pmulld.ll | 4 +-
llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll | 2 +-
llvm/test/CodeGen/X86/sqrt-fastmath.ll | 25 ++--
llvm/test/CodeGen/X86/sse-domains.ll | 2 +-
llvm/test/CodeGen/X86/trunc-vector-width.ll | 2 +-
.../CodeGen/X86/vector-shuffle-512-v16.ll | 34 ++---
.../CodeGen/X86/vector-shuffle-512-v32.ll | 12 +-
.../CodeGen/X86/vector-shuffle-combining.ll | 10 +-
llvm/test/CodeGen/X86/vector-trunc-usat.ll | 2 +-
30 files changed, 314 insertions(+), 174 deletions(-)
diff --git a/llvm/lib/Target/X86/X86FixupVectorConstants.cpp b/llvm/lib/Target/X86/X86FixupVectorConstants.cpp
index a415a45775984..7a28a79727482 100644
--- a/llvm/lib/Target/X86/X86FixupVectorConstants.cpp
+++ b/llvm/lib/Target/X86/X86FixupVectorConstants.cpp
@@ -336,6 +336,13 @@ static Constant *rebuildZExtCst(const Constant *C, unsigned NumBits,
return rebuildExtCst(C, false, NumBits, NumElts, SrcEltBitWidth);
}
+template <typename T>
+static std::optional<bool> CmpOptionals(T NewVal, T CurVal) {
+ if (NewVal.has_value() && CurVal.has_value() && *NewVal != *CurVal)
+ return *NewVal < *CurVal;
+ return std::nullopt;
+}
+
bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
MachineBasicBlock &MBB,
MachineInstr &MI) {
@@ -348,6 +355,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
bool HasBWI = ST->hasBWI();
bool HasVLX = ST->hasVLX();
bool MultiDomain = ST->hasAVX512() || ST->hasNoDomainDelayMov();
+ bool OptSize = MF.getFunction().hasOptSize();
struct FixupEntry {
int Op;
@@ -356,6 +364,51 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
std::function<Constant *(const Constant *, unsigned, unsigned, unsigned)>
RebuildConstant;
};
+
+ auto GetInstTput = [&](unsigned Opcode) -> std::optional<double> {
+ // We already checked that SchedModel exists in `NewOpcPreferable`.
+ return MCSchedModel::getReciprocalThroughput(
+ *ST, *(SM->getSchedClassDesc(TII->get(Opcode).getSchedClass())));
+ };
+ auto GetInstLat = [&](unsigned Opcode) -> std::optional<double> {
+ // We already checked that SchedModel exists in `NewOpcPreferable`.
+ return MCSchedModel::computeInstrLatency(
+ *ST, *(SM->getSchedClassDesc(TII->get(Opcode).getSchedClass())));
+ };
+ auto GetInstSize = [&](unsigned Opcode) -> std::optional<unsigned> {
+ if (unsigned Size = TII->get(Opcode).getSize())
+ return Size;
+ // Zero size means we were unable to compute it.
+ return std::nullopt;
+ };
+
+ auto NewOpcPreferable = [&](const FixupEntry &Fixup) -> bool {
+ unsigned NewOpc = Fixup.Op;
+
+ std::optional<bool> Res;
+ if (SM->hasInstrSchedModel()) {
+ // Compare tput -> lat -> code size.
+ // TODO: how much increase in tput/latency/size should we permit for the
+ // reduction in constant pool size?
+ Res = CmpOptionals(GetInstTput(NewOpc), GetInstTput(Opc));
+ if (Res.has_value())
+ return *Res;
+
+ Res = CmpOptionals(GetInstLat(NewOpc), GetInstLat(Opc));
+ if (Res.has_value())
+ return *Res;
+ }
+
+ // TODO: Include data reduction in size comparison?
+ Res = CmpOptionals(GetInstSize(Opc), GetInstSize(NewOpc));
+ if (Res.has_value())
+ return *Res;
+
+ // We either were unable to get tput/lat/codesize or all values were equal.
+ // Prefer the new opcode for reduced constant pool size.
+ return true;
+ };
+
auto FixupConstant = [&](ArrayRef<FixupEntry> Fixups, unsigned RegBitWidth,
unsigned OperandNo) {
#ifdef EXPENSIVE_CHECKS
@@ -372,7 +425,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
unsigned CstBitWidth = C->getType()->getPrimitiveSizeInBits();
RegBitWidth = RegBitWidth ? RegBitWidth : CstBitWidth;
for (const FixupEntry &Fixup : Fixups) {
- if (Fixup.Op) {
+ if (Fixup.Op && (OptSize || NewOpcPreferable(Fixup))) {
// Construct a suitable constant and adjust the MI to use the new
// constant pool entry.
if (Constant *NewCst = Fixup.RebuildConstant(
diff --git a/llvm/test/CodeGen/X86/avgceils.ll b/llvm/test/CodeGen/X86/avgceils.ll
index bb6f3fc9f588a..70b2d4be2fd81 100644
--- a/llvm/test/CodeGen/X86/avgceils.ll
+++ b/llvm/test/CodeGen/X86/avgceils.ll
@@ -39,7 +39,7 @@ define <16 x i8> @test_fixed_v16i8(<16 x i8> %a0, <16 x i8> %a1) nounwind {
;
; AVX512-LABEL: test_fixed_v16i8:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX512-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpavgb %xmm1, %xmm0, %xmm0
@@ -82,7 +82,7 @@ define <16 x i8> @test_ext_v16i8(<16 x i8> %a0, <16 x i8> %a1) nounwind {
;
; AVX512-LABEL: test_ext_v16i8:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX512-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpavgb %xmm1, %xmm0, %xmm0
@@ -365,7 +365,7 @@ define <32 x i8> @test_fixed_v32i8(<32 x i8> %a0, <32 x i8> %a1) nounwind {
;
; AVX512-LABEL: test_fixed_v32i8:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastb {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX512-NEXT: vpxor %ymm2, %ymm1, %ymm1
; AVX512-NEXT: vpxor %ymm2, %ymm0, %ymm0
; AVX512-NEXT: vpavgb %ymm1, %ymm0, %ymm0
@@ -416,7 +416,7 @@ define <32 x i8> @test_ext_v32i8(<32 x i8> %a0, <32 x i8> %a1) nounwind {
;
; AVX512-LABEL: test_ext_v32i8:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastb {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX512-NEXT: vpxor %ymm2, %ymm1, %ymm1
; AVX512-NEXT: vpxor %ymm2, %ymm0, %ymm0
; AVX512-NEXT: vpavgb %ymm1, %ymm0, %ymm0
@@ -875,7 +875,7 @@ define <64 x i8> @test_fixed_v64i8(<64 x i8> %a0, <64 x i8> %a1) nounwind {
;
; AVX512-LABEL: test_fixed_v64i8:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastb {{.*#+}} zmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512-NEXT: vpbroadcastd {{.*#+}} zmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX512-NEXT: vpxorq %zmm2, %zmm1, %zmm1
; AVX512-NEXT: vpxorq %zmm2, %zmm0, %zmm0
; AVX512-NEXT: vpavgb %zmm1, %zmm0, %zmm0
@@ -946,7 +946,7 @@ define <64 x i8> @test_ext_v64i8(<64 x i8> %a0, <64 x i8> %a1) nounwind {
;
; AVX512-LABEL: test_ext_v64i8:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastb {{.*#+}} zmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512-NEXT: vpbroadcastd {{.*#+}} zmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX512-NEXT: vpxorq %zmm2, %zmm1, %zmm1
; AVX512-NEXT: vpxorq %zmm2, %zmm0, %zmm0
; AVX512-NEXT: vpavgb %zmm1, %zmm0, %zmm0
diff --git a/llvm/test/CodeGen/X86/avgfloors.ll b/llvm/test/CodeGen/X86/avgfloors.ll
index 2566860357130..0508e5ccb5430 100644
--- a/llvm/test/CodeGen/X86/avgfloors.ll
+++ b/llvm/test/CodeGen/X86/avgfloors.ll
@@ -52,7 +52,7 @@ define <16 x i8> @test_fixed_v16i8(<16 x i8> %a0, <16 x i8> %a1) nounwind {
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpsrlw $1, %xmm0, %xmm0
-; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
; AVX512-NEXT: vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & mem)
; AVX512-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0
@@ -107,7 +107,7 @@ define <16 x i8> @test_ext_v16i8(<16 x i8> %a0, <16 x i8> %a1) nounwind {
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpsrlw $1, %xmm0, %xmm0
-; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
; AVX512-NEXT: vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & mem)
; AVX512-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0
@@ -404,7 +404,7 @@ define <32 x i8> @test_fixed_v32i8(<32 x i8> %a0, <32 x i8> %a1) nounwind {
; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpsrlw $1, %ymm0, %ymm0
-; AVX512-NEXT: vpbroadcastb {{.*#+}} ymm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
; AVX512-NEXT: vpternlogd {{.*#+}} ymm0 = ymm1 ^ (ymm0 & mem)
; AVX512-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX512-NEXT: vpsubb %ymm1, %ymm0, %ymm0
@@ -477,7 +477,7 @@ define <32 x i8> @test_ext_v32i8(<32 x i8> %a0, <32 x i8> %a1) nounwind {
; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpsrlw $1, %ymm0, %ymm0
-; AVX512-NEXT: vpbroadcastb {{.*#+}} ymm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
; AVX512-NEXT: vpternlogd {{.*#+}} ymm0 = ymm1 ^ (ymm0 & mem)
; AVX512-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX512-NEXT: vpsubb %ymm1, %ymm0, %ymm0
@@ -965,7 +965,7 @@ define <64 x i8> @test_fixed_v64i8(<64 x i8> %a0, <64 x i8> %a1) nounwind {
; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm2
; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpsrlw $1, %zmm0, %zmm0
-; AVX512-NEXT: vpbroadcastb {{.*#+}} zmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; AVX512-NEXT: vpbroadcastd {{.*#+}} zmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
; AVX512-NEXT: vpternlogd {{.*#+}} zmm0 = zmm1 ^ (zmm0 & mem)
; AVX512-NEXT: vpaddb %zmm2, %zmm0, %zmm0
; AVX512-NEXT: vpsubb %zmm1, %zmm0, %zmm0
@@ -1077,7 +1077,7 @@ define <64 x i8> @test_ext_v64i8(<64 x i8> %a0, <64 x i8> %a1) nounwind {
; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm2
; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpsrlw $1, %zmm0, %zmm0
-; AVX512-NEXT: vpbroadcastb {{.*#+}} zmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; AVX512-NEXT: vpbroadcastd {{.*#+}} zmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
; AVX512-NEXT: vpternlogd {{.*#+}} zmm0 = zmm1 ^ (zmm0 & mem)
; AVX512-NEXT: vpaddb %zmm2, %zmm0, %zmm0
; AVX512-NEXT: vpsubb %zmm1, %zmm0, %zmm0
diff --git a/llvm/test/CodeGen/X86/avx512-build-vector.ll b/llvm/test/CodeGen/X86/avx512-build-vector.ll
index b21a0c4e36c2b..55478a2e93154 100644
--- a/llvm/test/CodeGen/X86/avx512-build-vector.ll
+++ b/llvm/test/CodeGen/X86/avx512-build-vector.ll
@@ -15,7 +15,7 @@ define <16 x float> @test3(<4 x float> %a) {
; CHECK-LABEL: test3:
; CHECK: ## %bb.0:
; CHECK-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
-; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,2,3,4,18,16,7,8,9,10,11,12,13,14,15]
+; CHECK-NEXT: vmovaps {{.*#+}} zmm2 = [0,1,2,3,4,18,16,7,8,9,10,11,12,13,14,15]
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vpermt2ps %zmm0, %zmm2, %zmm1
; CHECK-NEXT: vmovaps %zmm1, %zmm0
diff --git a/llvm/test/CodeGen/X86/combine-or-shuffle.ll b/llvm/test/CodeGen/X86/combine-or-shuffle.ll
index 55b1cdeddb853..95b5fcf8eac52 100644
--- a/llvm/test/CodeGen/X86/combine-or-shuffle.ll
+++ b/llvm/test/CodeGen/X86/combine-or-shuffle.ll
@@ -836,7 +836,7 @@ define <4 x i32> @or_and_v4i32(<4 x i32> %a0) {
;
; AVX512-LABEL: or_and_v4i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm1 = [3,3,15,7]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3,15,7]
; AVX512-NEXT: vpternlogd {{.*#+}} xmm0 = xmm1 & (xmm0 | mem)
; AVX512-NEXT: retq
%1 = and <4 x i32> %a0, <i32 1, i32 3, i32 5, i32 7>
diff --git a/llvm/test/CodeGen/X86/combine-or.ll b/llvm/test/CodeGen/X86/combine-or.ll
index 08262e4d34b26..8c91274abf3dd 100644
--- a/llvm/test/CodeGen/X86/combine-or.ll
+++ b/llvm/test/CodeGen/X86/combine-or.ll
@@ -29,16 +29,11 @@ define <2 x i64> @or_zext_v2i32(<2 x i32> %a0) {
; SSE-NEXT: movaps {{.*#+}} xmm0 = [4294967295,4294967295]
; SSE-NEXT: retq
;
-; AVX1-LABEL: or_zext_v2i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4294967295,0,4294967295,0]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: or_zext_v2i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = [4294967295,4294967295]
-; AVX2-NEXT: # xmm0 = mem[0,0]
-; AVX2-NEXT: retq
+; AVX-LABEL: or_zext_v2i32:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovddup {{.*#+}} xmm0 = [4294967295,4294967295]
+; AVX-NEXT: # xmm0 = mem[0,0]
+; AVX-NEXT: retq
%1 = zext <2 x i32> %a0 to <2 x i64>
%2 = or <2 x i64> %1, <i64 4294967295, i64 4294967295>
ret <2 x i64> %2
@@ -261,7 +256,7 @@ define i64 @PR89533(<64 x i8> %a0) {
;
; AVX2-LABEL: PR89533:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95]
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95]
; AVX2-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpmovmskb %ymm0, %eax
; AVX2-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm0
diff --git a/llvm/test/CodeGen/X86/constant-pool-sharing.ll b/llvm/test/CodeGen/X86/constant-pool-sharing.ll
index 9e0114c017419..be835f78292e5 100644
--- a/llvm/test/CodeGen/X86/constant-pool-sharing.ll
+++ b/llvm/test/CodeGen/X86/constant-pool-sharing.ll
@@ -105,8 +105,7 @@ define void @store_repeated_constants(ptr %lo, ptr %hi) {
;
; AVX-LINUX-LABEL: store_repeated_constants:
; AVX-LINUX: # %bb.0:
-; AVX-LINUX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [18446744073709551615,0,18446744073709551615,0]
-; AVX-LINUX-NEXT: # ymm0 = mem[0,1,0,1]
+; AVX-LINUX-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551615,0,18446744073709551615,0]
; AVX-LINUX-NEXT: vmovaps %ymm0, (%rdi)
; AVX-LINUX-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551615,0,0,18446744073709551615]
; AVX-LINUX-NEXT: vmovaps %xmm0, %xmm1
@@ -119,8 +118,7 @@ define void @store_repeated_constants(ptr %lo, ptr %hi) {
;
; AVX-MSVC-LABEL: store_repeated_constants:
; AVX-MSVC: # %bb.0:
-; AVX-MSVC-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [18446744073709551615,0,18446744073709551615,0]
-; AVX-MSVC-NEXT: # ymm0 = mem[0,1,0,1]
+; AVX-MSVC-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551615,0,18446744073709551615,0]
; AVX-MSVC-NEXT: vmovaps %ymm0, (%rcx)
; AVX-MSVC-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551615,0,0,18446744073709551615]
; AVX-MSVC-NEXT: vmovaps %xmm0, %xmm1
diff --git a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
index 59a61722927de..5519d9b787b7f 100644
--- a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
+++ b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
@@ -389,7 +389,7 @@ define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) {
;
; CHECK-FMA-LABEL: fmul_pow2_8xhalf:
; CHECK-FMA: # %bb.0:
-; CHECK-FMA-NEXT: vpbroadcastw {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1]
+; CHECK-FMA-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1]
; CHECK-FMA-NEXT: vpsllvw %xmm0, %xmm1, %xmm0
; CHECK-FMA-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; CHECK-FMA-NEXT: vcvtdq2ps %ymm0, %ymm0
@@ -649,12 +649,26 @@ define <8 x half> @fdiv_pow2_8xhalf(<8 x i16> %i) {
; CHECK-SSE-NEXT: movdqa %xmm1, %xmm0
; CHECK-SSE-NEXT: retq
;
-; CHECK-AVX-LABEL: fdiv_pow2_8xhalf:
-; CHECK-AVX: # %bb.0:
-; CHECK-AVX-NEXT: vpsllw $10, %xmm0, %xmm0
-; CHECK-AVX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [28672,28672,28672,28672,28672,28672,28672,28672]
-; CHECK-AVX-NEXT: vpsubw %xmm0, %xmm1, %xmm0
-; CHECK-AVX-NEXT: retq
+; CHECK-AVX2-LABEL: fdiv_pow2_8xhalf:
+; CHECK-AVX2: # %bb.0:
+; CHECK-AVX2-NEXT: vpsllw $10, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [28672,28672,28672,28672,28672,28672,28672,28672]
+; CHECK-AVX2-NEXT: vpsubw %xmm0, %xmm1, %xmm0
+; CHECK-AVX2-NEXT: retq
+;
+; CHECK-NO-FASTFMA-LABEL: fdiv_pow2_8xhalf:
+; CHECK-NO-FASTFMA: # %bb.0:
+; CHECK-NO-FASTFMA-NEXT: vpsllw $10, %xmm0, %xmm0
+; CHECK-NO-FASTFMA-NEXT: vpbroadcastw {{.*#+}} xmm1 = [28672,28672,28672,28672,28672,28672,28672,28672]
+; CHECK-NO-FASTFMA-NEXT: vpsubw %xmm0, %xmm1, %xmm0
+; CHECK-NO-FASTFMA-NEXT: retq
+;
+; CHECK-FMA-LABEL: fdiv_pow2_8xhalf:
+; CHECK-FMA: # %bb.0:
+; CHECK-FMA-NEXT: vpsllw $10, %xmm0, %xmm0
+; CHECK-FMA-NEXT: vpbroadcastd {{.*#+}} xmm1 = [28672,28672,28672,28672,28672,28672,28672,28672]
+; CHECK-FMA-NEXT: vpsubw %xmm0, %xmm1, %xmm0
+; CHECK-FMA-NEXT: retq
%p2 = shl <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>, %i
%p2_f = uitofp <8 x i16> %p2 to <8 x half>
%r = fdiv <8 x half> <half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000>, %p2_f
@@ -1135,7 +1149,7 @@ define <2 x half> @fmul_pow_shl_cnt_vec_fail_to_large(<2 x i16> %cnt) nounwind {
;
; CHECK-FMA-LABEL: fmul_pow_shl_cnt_vec_fail_to_large:
; CHECK-FMA: # %bb.0:
-; CHECK-FMA-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2]
+; CHECK-FMA-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2]
; CHECK-FMA-NEXT: vpsllvw %xmm0, %xmm1, %xmm0
; CHECK-FMA-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; CHECK-FMA-NEXT: vcvtdq2ps %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/fpclamptosat_vec.ll b/llvm/test/CodeGen/X86/fpclamptosat_vec.ll
index 4ac829c0dfd53..1a2cfd69650b8 100644
--- a/llvm/test/CodeGen/X86/fpclamptosat_vec.ll
+++ b/llvm/test/CodeGen/X86/fpclamptosat_vec.ll
@@ -198,7 +198,7 @@ define <2 x i32> @ustest_f64i32(<2 x double> %x) nounwind {
; AVX2-NEXT: vcvttsd2si %xmm0, %rax
; AVX2-NEXT: vmovq %rax, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4294967295,0,4294967295,0]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4294967295,4294967295]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
@@ -576,7 +576,8 @@ define <4 x i32> @ustest_f32i32(<4 x float> %x) nounwind {
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,0,0,0,0]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
+; AVX2-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
@@ -1023,7 +1024,8 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) nounwind {
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,0,0,0,0]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
+; AVX2-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
@@ -2817,7 +2819,7 @@ define <2 x i32> @ustest_f64i32_mm(<2 x double> %x) nounwind {
; AVX2-NEXT: vcvttsd2si %xmm0, %rax
; AVX2-NEXT: vmovq %rax, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4294967295,0,4294967295,0]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4294967295,4294967295]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
@@ -3190,7 +3192,8 @@ define <4 x i32> @ustest_f32i32_mm(<4 x float> %x) nounwind {
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,0,0,0,0]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
+; AVX2-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
@@ -3632,7 +3635,8 @@ define <4 x i32> @ustest_f16i32_mm(<4 x half> %x) nounwind {
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,0,0,0,0]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
+; AVX2-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/icmp-pow2-mask.ll b/llvm/test/CodeGen/X86/icmp-pow2-mask.ll
index af3b07bc131a9..9dd52fe6b7f44 100644
--- a/llvm/test/CodeGen/X86/icmp-pow2-mask.ll
+++ b/llvm/test/CodeGen/X86/icmp-pow2-mask.ll
@@ -52,33 +52,19 @@ define <8 x i16> @pow2_mask_v16i8(i8 zeroext %0) {
}
define <16 x i16> @pow2_mask_v16i16(i16 zeroext %0) {
-; SSE2-LABEL: pow2_mask_v16i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movd %edi, %xmm0
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [128,64,32,16,8,4,2,1]
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [32768,16384,8192,4096,2048,1024,512,256]
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pcmpeqw %xmm3, %xmm0
-; SSE2-NEXT: pcmpeqw %xmm2, %xmm1
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: pow2_mask_v16i16:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movd %edi, %xmm0
-; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = [128,64,32,16,8,4,2,1]
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: pand %xmm2, %xmm1
-; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [32768,16384,8192,4096,2048,1024,512,256]
-; SSE41-NEXT: pand %xmm3, %xmm0
-; SSE41-NEXT: pcmpeqw %xmm3, %xmm0
-; SSE41-NEXT: pcmpeqw %xmm2, %xmm1
-; SSE41-NEXT: retq
+; SSE-LABEL: pow2_mask_v16i16:
+; SSE: # %bb.0:
+; SSE-NEXT: movd %edi, %xmm0
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [128,64,32,16,8,4,2,1]
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: pand %xmm2, %xmm1
+; SSE-NEXT: movdqa {{.*#+}} xmm3 = [32768,16384,8192,4096,2048,1024,512,256]
+; SSE-NEXT: pand %xmm3, %xmm0
+; SSE-NEXT: pcmpeqw %xmm3, %xmm0
+; SSE-NEXT: pcmpeqw %xmm2, %xmm1
+; SSE-NEXT: retq
;
; AVX2-LABEL: pow2_mask_v16i16:
; AVX2: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/kmov.ll b/llvm/test/CodeGen/X86/kmov.ll
index 5e31baa1ec72f..cab810d30cd77 100644
--- a/llvm/test/CodeGen/X86/kmov.ll
+++ b/llvm/test/CodeGen/X86/kmov.ll
@@ -194,7 +194,7 @@ define <8 x i1> @i16_mask_extract_8(i16 %mask) {
; X64-AVX512-LABEL: i16_mask_extract_8:
; X64-AVX512: # %bb.0:
; X64-AVX512-NEXT: vpbroadcastw %edi, %xmm0
-; X64-AVX512-NEXT: vpmovzxbw {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
+; X64-AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
; X64-AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT: retq
@@ -203,7 +203,7 @@ define <8 x i1> @i16_mask_extract_8(i16 %mask) {
; X64-KNL: # %bb.0:
; X64-KNL-NEXT: vmovd %edi, %xmm0
; X64-KNL-NEXT: vpbroadcastw %xmm0, %xmm0
-; X64-KNL-NEXT: vpmovzxbw {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
+; X64-KNL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
; X64-KNL-NEXT: vpand %xmm1, %xmm0, %xmm0
; X64-KNL-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
; X64-KNL-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/machine-combiner-int-vec.ll b/llvm/test/CodeGen/X86/machine-combiner-int-vec.ll
index ae422381c841c..ee42cb8126f74 100644
--- a/llvm/test/CodeGen/X86/machine-combiner-int-vec.ll
+++ b/llvm/test/CodeGen/X86/machine-combiner-int-vec.ll
@@ -425,7 +425,7 @@ define <2 x i64> @reassociate_umax_v2i64(<2 x i64> %x0, <2 x i64> %x1, <2 x i64>
; AVX2-LABEL: reassociate_umax_v2i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
; AVX2-NEXT: vpxor %xmm1, %xmm2, %xmm4
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm5
; AVX2-NEXT: vpcmpgtq %xmm5, %xmm4, %xmm4
@@ -723,7 +723,7 @@ define <2 x i64> @reassociate_umin_v2i64(<2 x i64> %x0, <2 x i64> %x1, <2 x i64>
; AVX2-LABEL: reassociate_umin_v2i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
; AVX2-NEXT: vpxor %xmm1, %xmm2, %xmm4
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm5
; AVX2-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4
diff --git a/llvm/test/CodeGen/X86/min-legal-vector-width.ll b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
index 9b08d8baacee1..24c884211cf97 100644
--- a/llvm/test/CodeGen/X86/min-legal-vector-width.ll
+++ b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
@@ -889,7 +889,7 @@ define dso_local void @mul256(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"="
; CHECK-SKX-VBMI-NEXT: vmovdqa 32(%rdi), %ymm1
; CHECK-SKX-VBMI-NEXT: vmovdqa (%rsi), %ymm2
; CHECK-SKX-VBMI-NEXT: vmovdqa 32(%rsi), %ymm3
-; CHECK-SKX-VBMI-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; CHECK-SKX-VBMI-NEXT: vpbroadcastd {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; CHECK-SKX-VBMI-NEXT: vpandn %ymm3, %ymm4, %ymm5
; CHECK-SKX-VBMI-NEXT: vpmaddubsw %ymm5, %ymm1, %ymm5
; CHECK-SKX-VBMI-NEXT: vpand %ymm4, %ymm3, %ymm3
@@ -912,7 +912,7 @@ define dso_local void @mul256(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"="
; CHECK-AVX512-NEXT: vmovdqa 32(%rdi), %ymm1
; CHECK-AVX512-NEXT: vmovdqa (%rsi), %ymm2
; CHECK-AVX512-NEXT: vmovdqa 32(%rsi), %ymm3
-; CHECK-AVX512-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; CHECK-AVX512-NEXT: vpbroadcastd {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; CHECK-AVX512-NEXT: vpand %ymm4, %ymm3, %ymm5
; CHECK-AVX512-NEXT: vpmaddubsw %ymm5, %ymm1, %ymm5
; CHECK-AVX512-NEXT: vpandn %ymm3, %ymm4, %ymm3
@@ -936,7 +936,7 @@ define dso_local void @mul256(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"="
; CHECK-VBMI-NEXT: vmovdqa 32(%rdi), %ymm1
; CHECK-VBMI-NEXT: vmovdqa (%rsi), %ymm2
; CHECK-VBMI-NEXT: vmovdqa 32(%rsi), %ymm3
-; CHECK-VBMI-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; CHECK-VBMI-NEXT: vpbroadcastd {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; CHECK-VBMI-NEXT: vpandn %ymm3, %ymm4, %ymm5
; CHECK-VBMI-NEXT: vpmaddubsw %ymm5, %ymm1, %ymm5
; CHECK-VBMI-NEXT: vpand %ymm4, %ymm3, %ymm3
@@ -964,7 +964,7 @@ define dso_local void @mul512(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"="
; CHECK-SKX-VBMI: # %bb.0:
; CHECK-SKX-VBMI-NEXT: vmovdqa64 (%rdi), %zmm0
; CHECK-SKX-VBMI-NEXT: vmovdqa64 (%rsi), %zmm1
-; CHECK-SKX-VBMI-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; CHECK-SKX-VBMI-NEXT: vpbroadcastd {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; CHECK-SKX-VBMI-NEXT: vpandnq %zmm1, %zmm2, %zmm3
; CHECK-SKX-VBMI-NEXT: vpmaddubsw %zmm3, %zmm0, %zmm3
; CHECK-SKX-VBMI-NEXT: vpandq %zmm2, %zmm1, %zmm1
@@ -979,7 +979,7 @@ define dso_local void @mul512(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"="
; CHECK-AVX512: # %bb.0:
; CHECK-AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
; CHECK-AVX512-NEXT: vmovdqa64 (%rsi), %zmm1
-; CHECK-AVX512-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; CHECK-AVX512-NEXT: vpbroadcastd {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; CHECK-AVX512-NEXT: vpandq %zmm2, %zmm1, %zmm3
; CHECK-AVX512-NEXT: vpmaddubsw %zmm3, %zmm0, %zmm3
; CHECK-AVX512-NEXT: vpandnq %zmm1, %zmm2, %zmm1
@@ -994,7 +994,7 @@ define dso_local void @mul512(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"="
; CHECK-VBMI: # %bb.0:
; CHECK-VBMI-NEXT: vmovdqa64 (%rdi), %zmm0
; CHECK-VBMI-NEXT: vmovdqa64 (%rsi), %zmm1
-; CHECK-VBMI-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; CHECK-VBMI-NEXT: vpbroadcastd {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; CHECK-VBMI-NEXT: vpandnq %zmm1, %zmm2, %zmm3
; CHECK-VBMI-NEXT: vpmaddubsw %zmm3, %zmm0, %zmm3
; CHECK-VBMI-NEXT: vpandq %zmm2, %zmm1, %zmm1
@@ -1127,7 +1127,7 @@ define <16 x i16> @trunc_v16i32_v16i16_zeroes(ptr %x) nounwind "min-legal-vector
; CHECK-LABEL: trunc_v16i32_v16i16_zeroes:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %ymm1
-; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
; CHECK-NEXT: vpermi2w 32(%rdi), %ymm1, %ymm0
; CHECK-NEXT: retq
%a = load <16 x i32>, ptr %x
@@ -1182,7 +1182,7 @@ define <16 x i16> @trunc_v16i32_v16i16_sign(ptr %x) nounwind "min-legal-vector-w
; CHECK-LABEL: trunc_v16i32_v16i16_sign:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %ymm1
-; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
; CHECK-NEXT: vpermi2w 32(%rdi), %ymm1, %ymm0
; CHECK-NEXT: retq
%a = load <16 x i32>, ptr %x
diff --git a/llvm/test/CodeGen/X86/pr30290.ll b/llvm/test/CodeGen/X86/pr30290.ll
index 478cb142475da..74e553191331f 100644
--- a/llvm/test/CodeGen/X86/pr30290.ll
+++ b/llvm/test/CodeGen/X86/pr30290.ll
@@ -20,7 +20,7 @@ define void @foo(ptr byval(%struct.face) nocapture align 8) local_unnamed_addr {
; CHECK: # %bb.0:
; CHECK-NEXT: subq $40, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 48
-; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1]
+; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,1,1]
; CHECK-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movl $1, {{[0-9]+}}(%rsp)
; CHECK-NEXT: vmovups {{[0-9]+}}(%rsp), %xmm0
diff --git a/llvm/test/CodeGen/X86/pr38639.ll b/llvm/test/CodeGen/X86/pr38639.ll
index 15cc7581454aa..3674bdc180527 100644
--- a/llvm/test/CodeGen/X86/pr38639.ll
+++ b/llvm/test/CodeGen/X86/pr38639.ll
@@ -4,12 +4,11 @@
define <8 x double> @test(<4 x double> %a, <4 x double> %b) {
; CHECK-LABEL: test:
; CHECK: # %bb.0:
-; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm1 = [8.2071743224100002E-1,8.2071743224100002E-1,8.2071743224100002E-1,8.2071743224100002E-1]
+; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [u,8.2071743224100002E-1,8.2071743224100002E-1,8.2071743224100002E-1]
; CHECK-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; CHECK-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; CHECK-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3]
-; CHECK-NEXT: vmovddup {{.*#+}} xmm2 = [8.2071743224100002E-1,8.2071743224100002E-1]
-; CHECK-NEXT: # xmm2 = mem[0,0]
+; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [8.2071743224100002E-1,8.2071743224100002E-1]
; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; CHECK-NEXT: retq
%1 = shufflevector <4 x double> %a, <4 x double> <double undef, double 0x3FEA435134576E1C, double 0x3FEA435134576E1C, double 0x3FEA435134576E1C>, <8 x i32> <i32 6, i32 5, i32 2, i32 3, i32 5, i32 1, i32 3, i32 7>
diff --git a/llvm/test/CodeGen/X86/pr46532.ll b/llvm/test/CodeGen/X86/pr46532.ll
index c798e74a0b231..cbc677229ede6 100644
--- a/llvm/test/CodeGen/X86/pr46532.ll
+++ b/llvm/test/CodeGen/X86/pr46532.ll
@@ -7,7 +7,7 @@ define void @WhileWithLoopInvariantOperation.21() {
; CHECK-NEXT: movq (%rax), %rax
; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vmovaps %xmm0, 32(%rax)
-; CHECK-NEXT: vpmovsxbq {{.*#+}} xmm0 = [18446744073709551615,0]
+; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = [4294967295,4294967295,0,0]
; CHECK-NEXT: vmaskmovps %ymm0, %ymm0, (%rax)
while.1.body.preheader:
%0 = load ptr, ptr undef, align 8, !invariant.load !0, !dereferenceable !1, !align !2
diff --git a/llvm/test/CodeGen/X86/pr57340.ll b/llvm/test/CodeGen/X86/pr57340.ll
index 6bebbe3cee1f9..9ac3ee4de2a34 100644
--- a/llvm/test/CodeGen/X86/pr57340.ll
+++ b/llvm/test/CodeGen/X86/pr57340.ll
@@ -6,7 +6,7 @@ define void @main.41() local_unnamed_addr #1 {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpbroadcastw (%rax), %ymm0
; CHECK-NEXT: vmovdqu (%rax), %ymm1
-; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm2 = [31,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [31,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
; CHECK-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
; CHECK-NEXT: vcvtph2ps %ymm2, %zmm0
; CHECK-NEXT: vcvtph2ps %ymm1, %zmm1
diff --git a/llvm/test/CodeGen/X86/pr63108.ll b/llvm/test/CodeGen/X86/pr63108.ll
index 4bbc1707e10c3..b5b80515fc6d9 100644
--- a/llvm/test/CodeGen/X86/pr63108.ll
+++ b/llvm/test/CodeGen/X86/pr63108.ll
@@ -117,7 +117,7 @@ define i32 @PR63108() {
; AVX512-NEXT: testb %al, %al
; AVX512-NEXT: je .LBB0_2
; AVX512-NEXT: # %bb.1:
-; AVX512-NEXT: vpbroadcastw {{.*#+}} xmm0 = [251,223,251,223,251,223,251,223,251,223,251,223,251,223,251,223]
+; AVX512-NEXT: vmovd {{.*#+}} xmm0 = [57339,0,0,0]
; AVX512-NEXT: jmp .LBB0_5
; AVX512-NEXT: .LBB0_2: # %vector.body.preheader
; AVX512-NEXT: vmovd {{.*#+}} xmm0 = [57339,0,0,0]
diff --git a/llvm/test/CodeGen/X86/pr77459.ll b/llvm/test/CodeGen/X86/pr77459.ll
index 96f6a18819383..9c072e6f5e3fc 100644
--- a/llvm/test/CodeGen/X86/pr77459.ll
+++ b/llvm/test/CodeGen/X86/pr77459.ll
@@ -98,7 +98,7 @@ define i8 @reverse_cmp_v8i1(<8 x i16> %a0, <8 x i16> %a1) {
; AVX512: # %bb.0:
; AVX512-NEXT: vpcmpeqw %xmm1, %xmm0, %k0
; AVX512-NEXT: vpmovm2d %k0, %ymm0
-; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0]
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0]
; AVX512-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX512-NEXT: vpmovd2m %ymm0, %k0
; AVX512-NEXT: kmovd %k0, %eax
@@ -157,7 +157,7 @@ define i16 @reverse_cmp_v16i1(<16 x i8> %a0, <16 x i8> %a1) {
; AVX512: # %bb.0:
; AVX512-NEXT: vpcmpeqb %xmm1, %xmm0, %k0
; AVX512-NEXT: vpmovm2w %k0, %ymm0
-; AVX512-NEXT: vpmovsxbw {{.*#+}} ymm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; AVX512-NEXT: vpermw %ymm0, %ymm1, %ymm0
; AVX512-NEXT: vpmovw2m %ymm0, %k0
; AVX512-NEXT: kmovd %k0, %eax
diff --git a/llvm/test/CodeGen/X86/recip-fastmath.ll b/llvm/test/CodeGen/X86/recip-fastmath.ll
index bb8c074f97e5e..9728057e8f574 100644
--- a/llvm/test/CodeGen/X86/recip-fastmath.ll
+++ b/llvm/test/CodeGen/X86/recip-fastmath.ll
@@ -333,11 +333,53 @@ define <4 x float> @v4f32_no_estimate(<4 x float> %x) #0 {
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: v4f32_no_estimate:
-; AVX: # %bb.0:
-; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; AVX-NEXT: vdivps %xmm0, %xmm1, %xmm0
-; AVX-NEXT: retq
+; AVX-RECIP-LABEL: v4f32_no_estimate:
+; AVX-RECIP: # %bb.0:
+; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX-RECIP-NEXT: vdivps %xmm0, %xmm1, %xmm0
+; AVX-RECIP-NEXT: retq
+;
+; FMA-RECIP-LABEL: v4f32_no_estimate:
+; FMA-RECIP: # %bb.0:
+; FMA-RECIP-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; FMA-RECIP-NEXT: vdivps %xmm0, %xmm1, %xmm0
+; FMA-RECIP-NEXT: retq
+;
+; BDVER2-LABEL: v4f32_no_estimate:
+; BDVER2: # %bb.0:
+; BDVER2-NEXT: vmovaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BDVER2-NEXT: vdivps %xmm0, %xmm1, %xmm0
+; BDVER2-NEXT: retq
+;
+; BTVER2-LABEL: v4f32_no_estimate:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: vmovaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BTVER2-NEXT: vdivps %xmm0, %xmm1, %xmm0
+; BTVER2-NEXT: retq
+;
+; SANDY-LABEL: v4f32_no_estimate:
+; SANDY: # %bb.0:
+; SANDY-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SANDY-NEXT: vdivps %xmm0, %xmm1, %xmm0
+; SANDY-NEXT: retq
+;
+; HASWELL-LABEL: v4f32_no_estimate:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; HASWELL-NEXT: vdivps %xmm0, %xmm1, %xmm0
+; HASWELL-NEXT: retq
+;
+; HASWELL-NO-FMA-LABEL: v4f32_no_estimate:
+; HASWELL-NO-FMA: # %bb.0:
+; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; HASWELL-NO-FMA-NEXT: vdivps %xmm0, %xmm1, %xmm0
+; HASWELL-NO-FMA-NEXT: retq
+;
+; AVX512-LABEL: v4f32_no_estimate:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX512-NEXT: vdivps %xmm0, %xmm1, %xmm0
+; AVX512-NEXT: retq
%div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
ret <4 x float> %div
}
@@ -380,7 +422,7 @@ define <4 x float> @v4f32_one_step(<4 x float> %x) #1 {
;
; BTVER2-LABEL: v4f32_one_step:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BTVER2-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT: vrcpps %xmm0, %xmm1
; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0
; BTVER2-NEXT: vsubps %xmm0, %xmm2, %xmm0
@@ -567,7 +609,7 @@ define <4 x float> @v4f32_two_step(<4 x float> %x) #2 {
; BDVER2-LABEL: v4f32_two_step:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vrcpps %xmm0, %xmm1
-; BDVER2-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BDVER2-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BDVER2-NEXT: vfmsubps {{.*#+}} xmm3 = (xmm0 * xmm1) - xmm2
; BDVER2-NEXT: vfnmaddps {{.*#+}} xmm1 = -(xmm1 * xmm3) + xmm1
; BDVER2-NEXT: vfmsubps {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm2
@@ -576,7 +618,7 @@ define <4 x float> @v4f32_two_step(<4 x float> %x) #2 {
;
; BTVER2-LABEL: v4f32_two_step:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BTVER2-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT: vrcpps %xmm0, %xmm1
; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm2
; BTVER2-NEXT: vsubps %xmm2, %xmm3, %xmm2
@@ -652,11 +694,53 @@ define <8 x float> @v8f32_no_estimate(<8 x float> %x) #0 {
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: retq
;
-; AVX-LABEL: v8f32_no_estimate:
-; AVX: # %bb.0:
-; AVX-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; AVX-NEXT: vdivps %ymm0, %ymm1, %ymm0
-; AVX-NEXT: retq
+; AVX-RECIP-LABEL: v8f32_no_estimate:
+; AVX-RECIP: # %bb.0:
+; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX-RECIP-NEXT: vdivps %ymm0, %ymm1, %ymm0
+; AVX-RECIP-NEXT: retq
+;
+; FMA-RECIP-LABEL: v8f32_no_estimate:
+; FMA-RECIP: # %bb.0:
+; FMA-RECIP-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; FMA-RECIP-NEXT: vdivps %ymm0, %ymm1, %ymm0
+; FMA-RECIP-NEXT: retq
+;
+; BDVER2-LABEL: v8f32_no_estimate:
+; BDVER2: # %bb.0:
+; BDVER2-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BDVER2-NEXT: vdivps %ymm0, %ymm1, %ymm0
+; BDVER2-NEXT: retq
+;
+; BTVER2-LABEL: v8f32_no_estimate:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BTVER2-NEXT: vdivps %ymm0, %ymm1, %ymm0
+; BTVER2-NEXT: retq
+;
+; SANDY-LABEL: v8f32_no_estimate:
+; SANDY: # %bb.0:
+; SANDY-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SANDY-NEXT: vdivps %ymm0, %ymm1, %ymm0
+; SANDY-NEXT: retq
+;
+; HASWELL-LABEL: v8f32_no_estimate:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; HASWELL-NEXT: vdivps %ymm0, %ymm1, %ymm0
+; HASWELL-NEXT: retq
+;
+; HASWELL-NO-FMA-LABEL: v8f32_no_estimate:
+; HASWELL-NO-FMA: # %bb.0:
+; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; HASWELL-NO-FMA-NEXT: vdivps %ymm0, %ymm1, %ymm0
+; HASWELL-NO-FMA-NEXT: retq
+;
+; AVX512-LABEL: v8f32_no_estimate:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX512-NEXT: vdivps %ymm0, %ymm1, %ymm0
+; AVX512-NEXT: retq
%div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
ret <8 x float> %div
}
@@ -706,7 +790,7 @@ define <8 x float> @v8f32_one_step(<8 x float> %x) #1 {
;
; BTVER2-LABEL: v8f32_one_step:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BTVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT: vrcpps %ymm0, %ymm1
; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0
; BTVER2-NEXT: vsubps %ymm0, %ymm2, %ymm0
@@ -718,7 +802,7 @@ define <8 x float> @v8f32_one_step(<8 x float> %x) #1 {
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpps %ymm0, %ymm1
; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0
-; SANDY-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0
; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0
; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0
@@ -819,7 +903,7 @@ define <8 x float> @v8f32_two_step(<8 x float> %x) #2 {
; BDVER2-LABEL: v8f32_two_step:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vrcpps %ymm0, %ymm1
-; BDVER2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BDVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BDVER2-NEXT: vfmsubps {{.*#+}} ymm3 = (ymm0 * ymm1) - ymm2
; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm1 = -(ymm1 * ymm3) + ymm1
; BDVER2-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm1) - ymm2
@@ -828,7 +912,7 @@ define <8 x float> @v8f32_two_step(<8 x float> %x) #2 {
;
; BTVER2-LABEL: v8f32_two_step:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT: vrcpps %ymm0, %ymm1
; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm2
; BTVER2-NEXT: vsubps %ymm2, %ymm3, %ymm2
@@ -844,7 +928,7 @@ define <8 x float> @v8f32_two_step(<8 x float> %x) #2 {
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpps %ymm0, %ymm1
; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm2
-; SANDY-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SANDY-NEXT: vsubps %ymm2, %ymm3, %ymm2
; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm2
; SANDY-NEXT: vaddps %ymm2, %ymm1, %ymm1
@@ -926,21 +1010,21 @@ define <16 x float> @v16f32_no_estimate(<16 x float> %x) #0 {
;
; BDVER2-LABEL: v16f32_no_estimate:
; BDVER2: # %bb.0:
-; BDVER2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BDVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BDVER2-NEXT: vdivps %ymm0, %ymm2, %ymm0
; BDVER2-NEXT: vdivps %ymm1, %ymm2, %ymm1
; BDVER2-NEXT: retq
;
; BTVER2-LABEL: v16f32_no_estimate:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BTVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT: vdivps %ymm0, %ymm2, %ymm0
; BTVER2-NEXT: vdivps %ymm1, %ymm2, %ymm1
; BTVER2-NEXT: retq
;
; SANDY-LABEL: v16f32_no_estimate:
; SANDY: # %bb.0:
-; SANDY-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SANDY-NEXT: vdivps %ymm0, %ymm2, %ymm0
; SANDY-NEXT: vdivps %ymm1, %ymm2, %ymm1
; SANDY-NEXT: retq
@@ -1030,7 +1114,7 @@ define <16 x float> @v16f32_one_step(<16 x float> %x) #1 {
; BDVER2-LABEL: v16f32_one_step:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vrcpps %ymm0, %ymm2
-; BDVER2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BDVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BDVER2-NEXT: vrcpps %ymm1, %ymm4
; BDVER2-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm2) - ymm3
; BDVER2-NEXT: vfmsubps {{.*#+}} ymm1 = (ymm1 * ymm4) - ymm3
@@ -1040,7 +1124,7 @@ define <16 x float> @v16f32_one_step(<16 x float> %x) #1 {
;
; BTVER2-LABEL: v16f32_one_step:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT: vrcpps %ymm0, %ymm2
; BTVER2-NEXT: vrcpps %ymm1, %ymm4
; BTVER2-NEXT: vmulps %ymm2, %ymm0, %ymm0
@@ -1057,7 +1141,7 @@ define <16 x float> @v16f32_one_step(<16 x float> %x) #1 {
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpps %ymm0, %ymm2
; SANDY-NEXT: vmulps %ymm2, %ymm0, %ymm0
-; SANDY-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SANDY-NEXT: vsubps %ymm0, %ymm3, %ymm0
; SANDY-NEXT: vrcpps %ymm1, %ymm4
; SANDY-NEXT: vmulps %ymm0, %ymm2, %ymm0
@@ -1204,7 +1288,7 @@ define <16 x float> @v16f32_two_step(<16 x float> %x) #2 {
; BDVER2-LABEL: v16f32_two_step:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vrcpps %ymm0, %ymm2
-; BDVER2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BDVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BDVER2-NEXT: vfmsubps {{.*#+}} ymm4 = (ymm0 * ymm2) - ymm3
; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm2 = -(ymm2 * ymm4) + ymm2
; BDVER2-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm2) - ymm3
@@ -1218,7 +1302,7 @@ define <16 x float> @v16f32_two_step(<16 x float> %x) #2 {
;
; BTVER2-LABEL: v16f32_two_step:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BTVER2-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT: vrcpps %ymm0, %ymm2
; BTVER2-NEXT: vmulps %ymm2, %ymm0, %ymm3
; BTVER2-NEXT: vsubps %ymm3, %ymm4, %ymm3
@@ -1243,7 +1327,7 @@ define <16 x float> @v16f32_two_step(<16 x float> %x) #2 {
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpps %ymm0, %ymm2
; SANDY-NEXT: vmulps %ymm2, %ymm0, %ymm3
-; SANDY-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SANDY-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SANDY-NEXT: vsubps %ymm3, %ymm4, %ymm3
; SANDY-NEXT: vmulps %ymm3, %ymm2, %ymm3
; SANDY-NEXT: vaddps %ymm3, %ymm2, %ymm2
diff --git a/llvm/test/CodeGen/X86/recip-fastmath2.ll b/llvm/test/CodeGen/X86/recip-fastmath2.ll
index 042069703af21..64b44fab9546b 100644
--- a/llvm/test/CodeGen/X86/recip-fastmath2.ll
+++ b/llvm/test/CodeGen/X86/recip-fastmath2.ll
@@ -504,7 +504,7 @@ define <4 x float> @v4f32_one_step_2_divs(<4 x float> %x) #1 {
;
; BTVER2-LABEL: v4f32_one_step_2_divs:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BTVER2-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT: vrcpps %xmm0, %xmm1
; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0
; BTVER2-NEXT: vsubps %xmm0, %xmm2, %xmm0
@@ -632,7 +632,7 @@ define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 {
;
; BTVER2-LABEL: v4f32_two_step2:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BTVER2-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT: vrcpps %xmm0, %xmm1
; BTVER2-NEXT: vmovaps {{.*#+}} xmm4 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm2
@@ -866,7 +866,7 @@ define <8 x float> @v8f32_one_step_2_divs(<8 x float> %x) #1 {
;
; BTVER2-LABEL: v8f32_one_step_2_divs:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BTVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT: vrcpps %ymm0, %ymm1
; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0
; BTVER2-NEXT: vsubps %ymm0, %ymm2, %ymm0
@@ -880,7 +880,7 @@ define <8 x float> @v8f32_one_step_2_divs(<8 x float> %x) #1 {
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpps %ymm0, %ymm1
; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0
-; SANDY-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0
; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0
; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0
@@ -1009,7 +1009,7 @@ define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 {
;
; BTVER2-LABEL: v8f32_two_step2:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT: vrcpps %ymm0, %ymm1
; BTVER2-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm2
@@ -1027,7 +1027,7 @@ define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 {
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpps %ymm0, %ymm1
; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm2
-; SANDY-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SANDY-NEXT: vsubps %ymm2, %ymm3, %ymm2
; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm2
; SANDY-NEXT: vaddps %ymm2, %ymm1, %ymm1
@@ -1360,7 +1360,7 @@ define <16 x float> @v16f32_one_step_2_divs(<16 x float> %x) #1 {
; BDVER2-LABEL: v16f32_one_step_2_divs:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vrcpps %ymm0, %ymm2
-; BDVER2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BDVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BDVER2-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm2) - ymm3
; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm2
; BDVER2-NEXT: vrcpps %ymm1, %ymm2
@@ -1374,7 +1374,7 @@ define <16 x float> @v16f32_one_step_2_divs(<16 x float> %x) #1 {
;
; BTVER2-LABEL: v16f32_one_step_2_divs:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT: vrcpps %ymm0, %ymm2
; BTVER2-NEXT: vmulps %ymm2, %ymm0, %ymm0
; BTVER2-NEXT: vsubps %ymm0, %ymm3, %ymm0
@@ -1395,7 +1395,7 @@ define <16 x float> @v16f32_one_step_2_divs(<16 x float> %x) #1 {
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpps %ymm0, %ymm2
; SANDY-NEXT: vmulps %ymm2, %ymm0, %ymm0
-; SANDY-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SANDY-NEXT: vsubps %ymm0, %ymm3, %ymm0
; SANDY-NEXT: vmulps %ymm0, %ymm2, %ymm0
; SANDY-NEXT: vaddps %ymm0, %ymm2, %ymm0
@@ -1572,7 +1572,7 @@ define <16 x float> @v16f32_two_step2(<16 x float> %x) #2 {
; BDVER2-LABEL: v16f32_two_step2:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vrcpps %ymm0, %ymm2
-; BDVER2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BDVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BDVER2-NEXT: vfmsubps {{.*#+}} ymm4 = (ymm0 * ymm2) - ymm3
; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm2 = -(ymm2 * ymm4) + ymm2
; BDVER2-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
@@ -1590,7 +1590,7 @@ define <16 x float> @v16f32_two_step2(<16 x float> %x) #2 {
;
; BTVER2-LABEL: v16f32_two_step2:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BTVER2-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT: vrcpps %ymm0, %ymm2
; BTVER2-NEXT: vmulps %ymm2, %ymm0, %ymm3
; BTVER2-NEXT: vsubps %ymm3, %ymm4, %ymm3
@@ -1619,7 +1619,7 @@ define <16 x float> @v16f32_two_step2(<16 x float> %x) #2 {
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpps %ymm0, %ymm2
; SANDY-NEXT: vmulps %ymm2, %ymm0, %ymm3
-; SANDY-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SANDY-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SANDY-NEXT: vsubps %ymm3, %ymm4, %ymm3
; SANDY-NEXT: vmulps %ymm3, %ymm2, %ymm3
; SANDY-NEXT: vaddps %ymm3, %ymm2, %ymm2
diff --git a/llvm/test/CodeGen/X86/slow-pmulld.ll b/llvm/test/CodeGen/X86/slow-pmulld.ll
index 59aa96520070e..975ffd06db03b 100644
--- a/llvm/test/CodeGen/X86/slow-pmulld.ll
+++ b/llvm/test/CodeGen/X86/slow-pmulld.ll
@@ -101,7 +101,7 @@ define <4 x i32> @test_mul_v4i32_v4i8(<4 x i8> %A) {
define <8 x i32> @test_mul_v8i32_v8i8(<8 x i8> %A) {
; SLM-LABEL: test_mul_v8i32_v8i8:
; SLM: # %bb.0:
-; SLM-NEXT: pmovsxwd {{.*#+}} xmm2 = [18778,18778,18778,18778]
+; SLM-NEXT: movdqa {{.*#+}} xmm2 = [18778,0,18778,0,18778,0,18778,0]
; SLM-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SLM-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SLM-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
@@ -199,7 +199,7 @@ define <16 x i32> @test_mul_v16i32_v16i8(<16 x i8> %A) {
; SLM-LABEL: test_mul_v16i32_v16i8:
; SLM: # %bb.0:
; SLM-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; SLM-NEXT: pmovsxwd {{.*#+}} xmm5 = [18778,18778,18778,18778]
+; SLM-NEXT: movdqa {{.*#+}} xmm5 = [18778,0,18778,0,18778,0,18778,0]
; SLM-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
; SLM-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SLM-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
diff --git a/llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll b/llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll
index 2b78a70ebcc26..74b51ac21dc1f 100644
--- a/llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll
+++ b/llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll
@@ -153,7 +153,7 @@ define <8 x float> @v8f32_no_daz(<8 x float> %f) #0 {
; SNB-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; SNB-NEXT: vmulps %ymm1, %ymm3, %ymm1
; SNB-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; SNB-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
+; SNB-NEXT: vmovaps {{.*#+}} ymm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
; SNB-NEXT: vcmpleps %ymm0, %ymm2, %ymm0
; SNB-NEXT: vandps %ymm1, %ymm0, %ymm0
; SNB-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/sqrt-fastmath.ll b/llvm/test/CodeGen/X86/sqrt-fastmath.ll
index 384f8b832afb9..9f420bcede110 100644
--- a/llvm/test/CodeGen/X86/sqrt-fastmath.ll
+++ b/llvm/test/CodeGen/X86/sqrt-fastmath.ll
@@ -454,12 +454,19 @@ define <8 x float> @v8f32_no_estimate(<8 x float> %x) #0 {
; SSE-NEXT: divps %xmm2, %xmm1
; SSE-NEXT: retq
;
-; AVX-LABEL: v8f32_no_estimate:
-; AVX: # %bb.0:
-; AVX-NEXT: vsqrtps %ymm0, %ymm0
-; AVX-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; AVX-NEXT: vdivps %ymm0, %ymm1, %ymm0
-; AVX-NEXT: retq
+; AVX1-LABEL: v8f32_no_estimate:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vsqrtps %ymm0, %ymm0
+; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX1-NEXT: vdivps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX512-LABEL: v8f32_no_estimate:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vsqrtps %ymm0, %ymm0
+; AVX512-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX512-NEXT: vdivps %ymm0, %ymm1, %ymm0
+; AVX512-NEXT: retq
%sqrt = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %x)
%div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
ret <8 x float> %div
@@ -530,7 +537,7 @@ define <16 x float> @v16f32_no_estimate(<16 x float> %x) #0 {
; AVX1: # %bb.0:
; AVX1-NEXT: vsqrtps %ymm1, %ymm1
; AVX1-NEXT: vsqrtps %ymm0, %ymm0
-; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX1-NEXT: vdivps %ymm0, %ymm2, %ymm0
; AVX1-NEXT: vdivps %ymm1, %ymm2, %ymm1
; AVX1-NEXT: retq
@@ -580,11 +587,11 @@ define <16 x float> @v16f32_estimate(<16 x float> %x) #1 {
; AVX1-LABEL: v16f32_estimate:
; AVX1: # %bb.0:
; AVX1-NEXT: vrsqrtps %ymm0, %ymm2
-; AVX1-NEXT: vbroadcastss {{.*#+}} ymm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; AVX1-NEXT: vmulps %ymm3, %ymm2, %ymm4
; AVX1-NEXT: vmulps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vmulps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
+; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; AVX1-NEXT: vaddps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vrsqrtps %ymm1, %ymm5
; AVX1-NEXT: vmulps %ymm0, %ymm4, %ymm0
diff --git a/llvm/test/CodeGen/X86/sse-domains.ll b/llvm/test/CodeGen/X86/sse-domains.ll
index 249425ec4dc9e..8a1b811b77e5d 100644
--- a/llvm/test/CodeGen/X86/sse-domains.ll
+++ b/llvm/test/CodeGen/X86/sse-domains.ll
@@ -17,7 +17,7 @@ define void @f(ptr nocapture %p, i32 %n) nounwind uwtable ssp {
; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: addq $16, %rdi
; CHECK-NEXT: pxor %xmm1, %xmm1
-; CHECK-NEXT: pmovsxbd {{.*#+}} xmm0 = [127,127,127,127]
+; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [127,127,127,127]
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: LBB0_1: ## %while.body
; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1
diff --git a/llvm/test/CodeGen/X86/trunc-vector-width.ll b/llvm/test/CodeGen/X86/trunc-vector-width.ll
index 42cc624b5a535..6cfe5b2841b8a 100644
--- a/llvm/test/CodeGen/X86/trunc-vector-width.ll
+++ b/llvm/test/CodeGen/X86/trunc-vector-width.ll
@@ -5,7 +5,7 @@ define void @test(ptr %a0) #0 {
; CHECK-LABEL: test:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqu (%rdi), %ymm0
-; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,4,0,0]
+; CHECK-NEXT: vmovq {{.*#+}} xmm1 = [0,4,0,0]
; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = mem[0],ymm0[1,2,3,4,5,6,7]
; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; CHECK-NEXT: vpermd %ymm0, %ymm1, %ymm0
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll
index 7df80ee9f175b..995607a6857bd 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll
@@ -220,7 +220,7 @@ define <16 x float> @shuffle_f32_v16f32_00_08_00_08_00_08_00_08_00_08_00_08_00_0
; FAST: # %bb.0:
; FAST-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; FAST-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; FAST-NEXT: vbroadcastsd {{.*#+}} ymm2 = [0,16,0,16,0,16,0,16]
+; FAST-NEXT: vmovaps {{.*#+}} ymm2 = [0,16,0,16,0,16,0,16]
; FAST-NEXT: vpermt2ps %zmm1, %zmm2, %zmm0
; FAST-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; FAST-NEXT: retq
@@ -277,7 +277,7 @@ define <16 x i32> @shuffle_v16i32_02_zz_03_zz_06_zz_07_zz_0a_zz_0b_zz_0e_zz_0f_z
define <16 x i32> @shuffle_v16i32_01_02_03_16_05_06_07_20_09_10_11_24_13_14_15_28(<16 x i32> %a, <16 x i32> %b) {
; AVX512F-LABEL: shuffle_v16i32_01_02_03_16_05_06_07_20_09_10_11_24_13_14_15_28:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm2 = [1,2,3,16,5,6,7,20,9,10,11,24,13,14,15,28]
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,2,3,16,5,6,7,20,9,10,11,24,13,14,15,28]
; AVX512F-NEXT: vpermt2d %zmm1, %zmm2, %zmm0
; AVX512F-NEXT: retq
;
@@ -292,7 +292,7 @@ define <16 x i32> @shuffle_v16i32_01_02_03_16_05_06_07_20_09_10_11_24_13_14_15_2
define <16 x float> @shuffle_v16f32_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01(<16 x float> %a) {
; ALL-LABEL: shuffle_v16f32_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01:
; ALL: # %bb.0:
-; ALL-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,5,0,0,7,0,10,1,0,5,0,4,7,0,10,1]
+; ALL-NEXT: vmovaps {{.*#+}} zmm1 = [2,5,u,u,7,u,10,1,0,5,u,4,7,u,10,1]
; ALL-NEXT: vpermps %zmm0, %zmm1, %zmm0
; ALL-NEXT: retq
%c = shufflevector <16 x float> %a, <16 x float> poison, <16 x i32> <i32 2, i32 5, i32 poison, i32 poison, i32 7, i32 poison, i32 10, i32 1, i32 0, i32 5, i32 poison, i32 4, i32 7, i32 poison, i32 10, i32 1>
@@ -302,7 +302,7 @@ define <16 x float> @shuffle_v16f32_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01(<
define <16 x i32> @shuffle_v16i32_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01(<16 x i32> %a) {
; ALL-LABEL: shuffle_v16i32_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01:
; ALL: # %bb.0:
-; ALL-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,5,0,0,7,0,10,1,0,5,0,4,7,0,10,1]
+; ALL-NEXT: vmovaps {{.*#+}} zmm1 = [2,5,u,u,7,u,10,1,0,5,u,4,7,u,10,1]
; ALL-NEXT: vpermps %zmm0, %zmm1, %zmm0
; ALL-NEXT: retq
%c = shufflevector <16 x i32> %a, <16 x i32> poison, <16 x i32> <i32 2, i32 5, i32 poison, i32 poison, i32 7, i32 poison, i32 10, i32 1, i32 0, i32 5, i32 poison, i32 4, i32 7, i32 poison, i32 10, i32 1>
@@ -312,7 +312,7 @@ define <16 x i32> @shuffle_v16i32_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01(<16
define <16 x i32> @shuffle_v16i32_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18(<16 x i32> %a, <16 x i32> %b) {
; ALL-LABEL: shuffle_v16i32_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18:
; ALL: # %bb.0:
-; ALL-NEXT: vpmovsxbd {{.*#+}} zmm2 = [15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,24]
+; ALL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,24]
; ALL-NEXT: vpermt2d %zmm1, %zmm2, %zmm0
; ALL-NEXT: retq
%c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24>
@@ -322,7 +322,7 @@ define <16 x i32> @shuffle_v16i32_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_1
define <16 x float> @shuffle_v16f32_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18(<16 x float> %a, <16 x float> %b) {
; ALL-LABEL: shuffle_v16f32_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18:
; ALL: # %bb.0:
-; ALL-NEXT: vpmovsxbd {{.*#+}} zmm2 = [15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,24]
+; ALL-NEXT: vmovaps {{.*#+}} zmm2 = [15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,24]
; ALL-NEXT: vpermt2ps %zmm1, %zmm2, %zmm0
; ALL-NEXT: retq
%c = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24>
@@ -339,7 +339,7 @@ define <16 x i32> @shuffle_v16i32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_0
;
; FAST-LABEL: shuffle_v16i32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04:
; FAST: # %bb.0:
-; FAST-NEXT: vpmovsxbd {{.*#+}} zmm1 = [11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4]
+; FAST-NEXT: vmovaps {{.*#+}} zmm1 = [11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4]
; FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0
; FAST-NEXT: retq
%1 = shufflevector <16 x i32> %a, <16 x i32> poison, <16 x i32> <i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
@@ -355,7 +355,7 @@ define <16 x float> @shuffle_v16f32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05
;
; FAST-LABEL: shuffle_v16f32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04:
; FAST: # %bb.0:
-; FAST-NEXT: vpmovsxbd {{.*#+}} zmm1 = [11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4]
+; FAST-NEXT: vmovaps {{.*#+}} zmm1 = [11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4]
; FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0
; FAST-NEXT: retq
%1 = shufflevector <16 x float> %a, <16 x float> poison, <16 x i32> <i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
@@ -365,7 +365,7 @@ define <16 x float> @shuffle_v16f32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05
define <16 x float> @shuffle_v16f32_load_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18(<16 x float> %a, ptr %b) {
; ALL-LABEL: shuffle_v16f32_load_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18:
; ALL: # %bb.0:
-; ALL-NEXT: vpmovsxbd {{.*#+}} zmm1 = [15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,24]
+; ALL-NEXT: vmovaps {{.*#+}} zmm1 = [15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,24]
; ALL-NEXT: vpermt2ps (%rdi), %zmm1, %zmm0
; ALL-NEXT: retq
%c = load <16 x float>, ptr %b
@@ -382,7 +382,7 @@ define <16 x float> @shuffle_v16f32_load_08_11_10_00_12_15_14_04(<16 x float> %a
;
; FAST-LABEL: shuffle_v16f32_load_08_11_10_00_12_15_14_04:
; FAST: # %bb.0:
-; FAST-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,3,2,16,4,7,6,20,8,11,10,24,12,15,14,28]
+; FAST-NEXT: vmovaps {{.*#+}} zmm1 = [0,3,2,16,4,7,6,20,8,11,10,24,12,15,14,28]
; FAST-NEXT: vpermt2ps (%rdi), %zmm1, %zmm0
; FAST-NEXT: retq
%1 = load <16 x float>, ptr %a1
@@ -393,7 +393,7 @@ define <16 x float> @shuffle_v16f32_load_08_11_10_00_12_15_14_04(<16 x float> %a
define <16 x i32> @shuffle_v16i32_load_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18(<16 x i32> %a, ptr %b) {
; ALL-LABEL: shuffle_v16i32_load_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18:
; ALL: # %bb.0:
-; ALL-NEXT: vpmovsxbd {{.*#+}} zmm1 = [15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,24]
+; ALL-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,24]
; ALL-NEXT: vpermt2d (%rdi), %zmm1, %zmm0
; ALL-NEXT: retq
%c = load <16 x i32>, ptr %b
@@ -421,7 +421,7 @@ define <8 x i32> @test_v16i32_1_3_5_7_9_11_13_15(<16 x i32> %v) {
;
; FAST-LABEL: test_v16i32_1_3_5_7_9_11_13_15:
; FAST: # %bb.0:
-; FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,3,5,7,9,11,13,15]
+; FAST-NEXT: vmovaps {{.*#+}} ymm1 = [1,3,5,7,9,11,13,15]
; FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0
; FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; FAST-NEXT: retq
@@ -441,7 +441,7 @@ define <4 x i32> @test_v16i32_0_1_2_12 (<16 x i32> %v) {
;
; FAST-LABEL: test_v16i32_0_1_2_12:
; FAST: # %bb.0:
-; FAST-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,1,2,12]
+; FAST-NEXT: vmovaps {{.*#+}} xmm1 = [0,1,2,12]
; FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0
; FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; FAST-NEXT: vzeroupper
@@ -455,7 +455,7 @@ define <4 x i32> @test_v16i32_0_1_2_12 (<16 x i32> %v) {
define <4 x i32> @test_v16i32_0_4_8_12(<16 x i32> %v) {
; ALL-LABEL: test_v16i32_0_4_8_12:
; ALL: # %bb.0:
-; ALL-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,4,8,12]
+; ALL-NEXT: vmovaps {{.*#+}} xmm1 = [0,4,8,12]
; ALL-NEXT: vpermps %zmm0, %zmm1, %zmm0
; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; ALL-NEXT: vzeroupper
@@ -478,7 +478,7 @@ define <8 x float> @shuffle_v16f32_extract_256(ptr %RET, ptr %a) {
define <8 x float> @test_v16f32_0_1_2_3_4_6_7_10 (<16 x float> %v) {
; ALL-LABEL: test_v16f32_0_1_2_3_4_6_7_10:
; ALL: # %bb.0:
-; ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,1,2,3,4,6,7,10]
+; ALL-NEXT: vmovaps {{.*#+}} ymm1 = [0,1,2,3,4,6,7,10]
; ALL-NEXT: vpermps %zmm0, %zmm1, %zmm0
; ALL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; ALL-NEXT: retq
@@ -490,7 +490,7 @@ define <8 x float> @test_v16f32_0_1_2_3_4_6_7_10 (<16 x float> %v) {
define <4 x float> @test_v16f32_0_1_3_6 (<16 x float> %v) {
; ALL-LABEL: test_v16f32_0_1_3_6:
; ALL: # %bb.0:
-; ALL-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,1,3,6]
+; ALL-NEXT: vmovaps {{.*#+}} xmm1 = [0,1,3,6]
; ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; ALL-NEXT: vzeroupper
@@ -607,7 +607,7 @@ define <16 x i32> @shuffle_v16i32_16_16_02_03_20_20_06_07_24_24_10_11_28_28_uu_u
define <16 x i32> @shuffle_v16i32_02_03_16_17_06_07_20_21_10_11_24_25_14_15_28_29(<16 x i32> %a, <16 x i32> %b) {
; AVX512F-LABEL: shuffle_v16i32_02_03_16_17_06_07_20_21_10_11_24_25_14_15_28_29:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm2 = [1,8,3,10,5,12,7,14]
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,8,3,10,5,12,7,14]
; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
; AVX512F-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll
index 3cf2d42799381..2d2bd38f4f727 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll
@@ -29,7 +29,7 @@ define <32 x i16> @shuffle_v32i16_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_0
;
; SKX-LABEL: shuffle_v32i16_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08:
; SKX: ## %bb.0:
-; SKX-NEXT: vpbroadcastw {{.*#+}} zmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; SKX-NEXT: vpbroadcastd {{.*#+}} zmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; SKX-NEXT: vpermw %zmm0, %zmm1, %zmm0
; SKX-NEXT: retq
%c = shufflevector <32 x i16> %a, <32 x i16> poison, <32 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
@@ -42,20 +42,20 @@ define <32 x i16> @shuffle_v32i16_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01_02_
; KNL-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[4,5,10,11,4,5,6,7,14,15,2,3,4,5,2,3,20,21,26,27,20,21,22,23,30,31,18,19,20,21,18,19]
; KNL-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
; KNL-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[0,1,10,11,8,9,8,9,14,15,6,7,4,5,14,15,16,17,26,27,24,25,24,25,30,31,22,23,20,21,30,31]
-; KNL-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,65535,0,0,65535,0,0,65535,0,0,0,0,0,0,65535,0]
+; KNL-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,u,u,u,u,255,255,u,u,0,0,255,255,0,0,0,0,u,u,0,0,0,0,u,u,255,255,u,u]
; KNL-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm3
; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; KNL-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm0[7],ymm3[8,9,10,11,12,13,14],ymm0[15]
; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
; KNL-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,10,11,8,9,8,9,14,15,2,3,4,5,2,3,16,17,26,27,24,25,24,25,30,31,18,19,20,21,18,19]
-; KNL-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,0,0,0,0,0,65535,0,65535,65535,0,65535,65535,0,0,65535]
+; KNL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,0,0,u,u,u,u,0,0,u,u,255,255,0,0,255,255,255,255,u,u,255,255,255,255,u,u,0,0,255,255]
; KNL-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: shuffle_v32i16_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_1f:
; SKX: ## %bb.0:
-; SKX-NEXT: vpmovsxbw {{.*#+}} zmm1 = [2,5,0,0,7,0,10,1,0,5,0,4,7,0,10,1,2,5,0,0,7,0,10,1,0,5,0,4,7,0,10,31]
+; SKX-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,5,u,u,7,u,10,1,0,5,u,4,7,u,10,1,2,5,u,u,7,u,10,1,0,5,u,4,7,u,10,31]
; SKX-NEXT: vpermw %zmm0, %zmm1, %zmm0
; SKX-NEXT: retq
%c = shufflevector <32 x i16> %a, <32 x i16> poison, <32 x i32> <i32 2, i32 5, i32 poison, i32 poison, i32 7, i32 poison, i32 10, i32 1, i32 0, i32 5, i32 poison, i32 4, i32 7, i32 poison, i32 10, i32 1, i32 2, i32 5, i32 poison, i32 poison, i32 7, i32 poison, i32 10, i32 1, i32 0, i32 5, i32 poison, i32 4, i32 7, i32 poison, i32 10, i32 31>
@@ -82,7 +82,7 @@ define <32 x i16> @shuffle_v32i16_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_1
;
; SKX-LABEL: shuffle_v32i16_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_38:
; SKX: ## %bb.0:
-; SKX-NEXT: vpmovsxbw {{.*#+}} zmm2 = [15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,24,15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,56]
+; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,24,15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,56]
; SKX-NEXT: vpermt2w %zmm1, %zmm2, %zmm0
; SKX-NEXT: retq
%c = shufflevector <32 x i16> %a, <32 x i16> %b, <32 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24, i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 56>
@@ -205,7 +205,7 @@ define <32 x i16> @shuffle_v32i16_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz(<32 x i16> %a
;
; SKX-LABEL: shuffle_v32i16_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz:
; SKX: ## %bb.0:
-; SKX-NEXT: vmovss {{.*#+}} xmm1 = [65535,0,0,0]
+; SKX-NEXT: vmovaps {{.*#+}} xmm1 = [65535,0,0,0]
; SKX-NEXT: vandps %zmm1, %zmm0, %zmm0
; SKX-NEXT: retq
%shuffle = shufflevector <32 x i16> %a, <32 x i16> zeroinitializer, <32 x i32> <i32 0, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
index 68040b58858a7..0378e7092d4b7 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
@@ -2698,13 +2698,13 @@ define <4 x i32> @combine_constant_insertion_v4i32(i32 %f) {
;
; SSE41-LABEL: combine_constant_insertion_v4i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [0,4,5,30]
+; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [u,4,5,30]
; SSE41-NEXT: pinsrd $0, %edi, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_constant_insertion_v4i32:
; AVX: # %bb.0:
-; AVX-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,4,5,30]
+; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [u,4,5,30]
; AVX-NEXT: vpinsrd $0, %edi, %xmm0, %xmm0
; AVX-NEXT: retq
%a0 = insertelement <4 x i32> undef, i32 %f, i32 0
@@ -3355,7 +3355,7 @@ define void @PR45604(ptr %dst, ptr %src) {
; SSE41-NEXT: movdqa (%rsi), %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; SSE41-NEXT: pmovsxbd {{.*#+}} xmm2 = [0,11,0,11]
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [u,0,11,0,u,0,11,0]
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; SSE41-NEXT: pmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
@@ -3379,7 +3379,7 @@ define void @PR45604(ptr %dst, ptr %src) {
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [720907,11,720907,11,720907,11,720907,11]
+; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [720907,11,720907,11,720907,11,720907,11]
; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
@@ -3395,7 +3395,7 @@ define void @PR45604(ptr %dst, ptr %src) {
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rsi), %xmm0
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,0,2]
-; AVX2-NEXT: vpmovsxdq {{.*#+}} ymm2 = [151519488,185205506,218891524,252577542]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,4,5,12,13,u,u,u,u,6,7,14,15,u,u,u,u]
; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [11,0,0,0,11,0,0,0,11,0,0,0,11,0,0,0,11,0,0,0,11,0,0,0,11,0,0,0,11,0,0,0]
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2],ymm3[3],ymm1[4],ymm3[5],ymm1[6],ymm3[7]
diff --git a/llvm/test/CodeGen/X86/vector-trunc-usat.ll b/llvm/test/CodeGen/X86/vector-trunc-usat.ll
index 65916aaf52f9e..a5d83a86f295e 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-usat.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-usat.ll
@@ -331,7 +331,7 @@ define <4 x i32> @trunc_usat_v4i64_v4i32(<4 x i64> %a0) {
; SKX-LABEL: trunc_usat_v4i64_v4i32:
; SKX: # %bb.0:
; SKX-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %k1
-; SKX-NEXT: vpmovzxdq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,429496729]
+; SKX-NEXT: vmovdqa {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,429496729]
; SKX-NEXT: vmovdqa64 %ymm0, %ymm1 {%k1}
; SKX-NEXT: vpmovqd %ymm1, %xmm0
; SKX-NEXT: vzeroupper
>From 7609534cd7e399d163e933d86064434927f069ea Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Thu, 15 May 2025 12:05:40 +0100
Subject: [PATCH 2/4] Simplify cost comparisons - assume any instruction size
increase will always be less than the savings in the constant pool (8 bytes
or more)
---
.../Target/X86/X86FixupVectorConstants.cpp | 58 +++++--------------
1 file changed, 16 insertions(+), 42 deletions(-)
diff --git a/llvm/lib/Target/X86/X86FixupVectorConstants.cpp b/llvm/lib/Target/X86/X86FixupVectorConstants.cpp
index 7a28a79727482..90d7745dcf24c 100644
--- a/llvm/lib/Target/X86/X86FixupVectorConstants.cpp
+++ b/llvm/lib/Target/X86/X86FixupVectorConstants.cpp
@@ -336,13 +336,6 @@ static Constant *rebuildZExtCst(const Constant *C, unsigned NumBits,
return rebuildExtCst(C, false, NumBits, NumElts, SrcEltBitWidth);
}
-template <typename T>
-static std::optional<bool> CmpOptionals(T NewVal, T CurVal) {
- if (NewVal.has_value() && CurVal.has_value() && *NewVal != *CurVal)
- return *NewVal < *CurVal;
- return std::nullopt;
-}
-
bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
MachineBasicBlock &MBB,
MachineInstr &MI) {
@@ -365,46 +358,27 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
RebuildConstant;
};
- auto GetInstTput = [&](unsigned Opcode) -> std::optional<double> {
- // We already checked that SchedModel exists in `NewOpcPreferable`.
- return MCSchedModel::getReciprocalThroughput(
- *ST, *(SM->getSchedClassDesc(TII->get(Opcode).getSchedClass())));
- };
- auto GetInstLat = [&](unsigned Opcode) -> std::optional<double> {
- // We already checked that SchedModel exists in `NewOpcPreferable`.
- return MCSchedModel::computeInstrLatency(
- *ST, *(SM->getSchedClassDesc(TII->get(Opcode).getSchedClass())));
- };
- auto GetInstSize = [&](unsigned Opcode) -> std::optional<unsigned> {
- if (unsigned Size = TII->get(Opcode).getSize())
- return Size;
- // Zero size means we where unable to compute it.
- return std::nullopt;
- };
-
auto NewOpcPreferable = [&](const FixupEntry &Fixup) -> bool {
- unsigned NewOpc = Fixup.Op;
-
- std::optional<bool> Res;
if (SM->hasInstrSchedModel()) {
- // Compare tput -> lat -> code size.
- // TODO: how much increase in tput/latency/size should we permit for the
+ // TODO: how much increase in tput/latency should we permit for the
// reduction in constant pool size?
- Res = CmpOptionals(GetInstTput(NewOpc), GetInstTput(Opc));
- if (Res.has_value())
- return *Res;
-
- Res = CmpOptionals(GetInstLat(NewOpc), GetInstLat(Opc));
- if (Res.has_value())
- return *Res;
+ unsigned NewOpc = Fixup.Op;
+ auto *OldDesc = SM->getSchedClassDesc(TII->get(Opc).getSchedClass());
+ auto *NewDesc = SM->getSchedClassDesc(TII->get(NewOpc).getSchedClass());
+
+ // Compare tput -> lat
+ double OldTput = MCSchedModel::getReciprocalThroughput(*ST, *OldDesc);
+ double NewTput = MCSchedModel::getReciprocalThroughput(*ST, *NewDesc);
+ if (OldTput != NewTput)
+ return NewTput < OldTput;
+
+ int OldLat = MCSchedModel::computeInstrLatency(*ST, *OldDesc);
+ int NewLat = MCSchedModel::computeInstrLatency(*ST, *NewDesc);
+ if (OldLat != NewLat)
+ return NewLat < OldLat;
}
- // TODO: Include data reduction in size comparison?
- Res = CmpOptionals(GetInstSize(Opc), GetInstSize(NewOpc));
- if (Res.has_value())
- return *Res;
-
- // We either were unable to get tput/lat/codesize or all values were equal.
+ // We either were unable to get tput/lat or all values were equal.
// Prefer the new opcode for reduced constant pool size.
return true;
};
>From 54cbcc8ebac98948d9fadb32eb6fb4309fb8399e Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Thu, 15 May 2025 12:42:48 +0100
Subject: [PATCH 3/4] Add a latency tolerance based on the reduction in data
load size
Allow a 1-cycle latency regression per 128 bits of constant data saved (a very basic rule of thumb).
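As a worked example of the BitsSaved/LatTol computation in the diff below: shrinking a full 256-bit ymm constant load to a broadcast of a single 32-bit element saves 256 - 1*32 = 224 bits, so LatTol = (224 + 127) / 128 = 2 and the smaller load is still accepted while NewLat < OldLat + 2, i.e. with at most one extra cycle of latency. Shrinking a 128-bit load to a 64-bit one saves only 64 bits, giving LatTol = 1 and hence no tolerated regression.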
---
llvm/lib/Target/X86/X86FixupVectorConstants.cpp | 16 ++++++++++------
llvm/test/CodeGen/X86/pr38639.ll | 2 +-
llvm/test/CodeGen/X86/recip-fastmath.ll | 12 ++++++------
llvm/test/CodeGen/X86/recip-fastmath2.ll | 8 ++++----
4 files changed, 21 insertions(+), 17 deletions(-)
diff --git a/llvm/lib/Target/X86/X86FixupVectorConstants.cpp b/llvm/lib/Target/X86/X86FixupVectorConstants.cpp
index 90d7745dcf24c..95bfaf57453b2 100644
--- a/llvm/lib/Target/X86/X86FixupVectorConstants.cpp
+++ b/llvm/lib/Target/X86/X86FixupVectorConstants.cpp
@@ -358,24 +358,28 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
RebuildConstant;
};
- auto NewOpcPreferable = [&](const FixupEntry &Fixup) -> bool {
+ auto NewOpcPreferable = [&](const FixupEntry &Fixup,
+ unsigned RegBitWidth) -> bool {
if (SM->hasInstrSchedModel()) {
- // TODO: how much increase in tput/latency should we permit for the
- // reduction in constant pool size?
unsigned NewOpc = Fixup.Op;
auto *OldDesc = SM->getSchedClassDesc(TII->get(Opc).getSchedClass());
auto *NewDesc = SM->getSchedClassDesc(TII->get(NewOpc).getSchedClass());
+ unsigned BitsSaved = RegBitWidth - (Fixup.NumCstElts * Fixup.MemBitWidth);
- // Compare tput -> lat
+ // Compare tput/lat - avoid any regressions, but allow an extra cycle of
+ // latency in exchange for each 128-bit (or less) constant pool reduction
+ // (this is a very simple cost:benefit estimate - there will probably be
+ // better ways to calculate this).
double OldTput = MCSchedModel::getReciprocalThroughput(*ST, *OldDesc);
double NewTput = MCSchedModel::getReciprocalThroughput(*ST, *NewDesc);
if (OldTput != NewTput)
return NewTput < OldTput;
+ int LatTol = (BitsSaved + 127) / 128;
int OldLat = MCSchedModel::computeInstrLatency(*ST, *OldDesc);
int NewLat = MCSchedModel::computeInstrLatency(*ST, *NewDesc);
if (OldLat != NewLat)
- return NewLat < OldLat;
+ return NewLat < (OldLat + LatTol);
}
// We either were unable to get tput/lat or all values were equal.
@@ -399,7 +403,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
unsigned CstBitWidth = C->getType()->getPrimitiveSizeInBits();
RegBitWidth = RegBitWidth ? RegBitWidth : CstBitWidth;
for (const FixupEntry &Fixup : Fixups) {
- if (Fixup.Op && (OptSize || NewOpcPreferable(Fixup))) {
+ if (Fixup.Op && (OptSize || NewOpcPreferable(Fixup, RegBitWidth))) {
// Construct a suitable constant and adjust the MI to use the new
// constant pool entry.
if (Constant *NewCst = Fixup.RebuildConstant(
diff --git a/llvm/test/CodeGen/X86/pr38639.ll b/llvm/test/CodeGen/X86/pr38639.ll
index 3674bdc180527..d4da95a00aab2 100644
--- a/llvm/test/CodeGen/X86/pr38639.ll
+++ b/llvm/test/CodeGen/X86/pr38639.ll
@@ -4,7 +4,7 @@
define <8 x double> @test(<4 x double> %a, <4 x double> %b) {
; CHECK-LABEL: test:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [u,8.2071743224100002E-1,8.2071743224100002E-1,8.2071743224100002E-1]
+; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm1 = [8.2071743224100002E-1,8.2071743224100002E-1,8.2071743224100002E-1,8.2071743224100002E-1]
; CHECK-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; CHECK-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; CHECK-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3]
diff --git a/llvm/test/CodeGen/X86/recip-fastmath.ll b/llvm/test/CodeGen/X86/recip-fastmath.ll
index 9728057e8f574..dab7a6aab2d82 100644
--- a/llvm/test/CodeGen/X86/recip-fastmath.ll
+++ b/llvm/test/CodeGen/X86/recip-fastmath.ll
@@ -714,7 +714,7 @@ define <8 x float> @v8f32_no_estimate(<8 x float> %x) #0 {
;
; BTVER2-LABEL: v8f32_no_estimate:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BTVER2-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT: vdivps %ymm0, %ymm1, %ymm0
; BTVER2-NEXT: retq
;
@@ -790,7 +790,7 @@ define <8 x float> @v8f32_one_step(<8 x float> %x) #1 {
;
; BTVER2-LABEL: v8f32_one_step:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BTVER2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT: vrcpps %ymm0, %ymm1
; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0
; BTVER2-NEXT: vsubps %ymm0, %ymm2, %ymm0
@@ -912,7 +912,7 @@ define <8 x float> @v8f32_two_step(<8 x float> %x) #2 {
;
; BTVER2-LABEL: v8f32_two_step:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BTVER2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT: vrcpps %ymm0, %ymm1
; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm2
; BTVER2-NEXT: vsubps %ymm2, %ymm3, %ymm2
@@ -1017,7 +1017,7 @@ define <16 x float> @v16f32_no_estimate(<16 x float> %x) #0 {
;
; BTVER2-LABEL: v16f32_no_estimate:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BTVER2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT: vdivps %ymm0, %ymm2, %ymm0
; BTVER2-NEXT: vdivps %ymm1, %ymm2, %ymm1
; BTVER2-NEXT: retq
@@ -1124,7 +1124,7 @@ define <16 x float> @v16f32_one_step(<16 x float> %x) #1 {
;
; BTVER2-LABEL: v16f32_one_step:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BTVER2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT: vrcpps %ymm0, %ymm2
; BTVER2-NEXT: vrcpps %ymm1, %ymm4
; BTVER2-NEXT: vmulps %ymm2, %ymm0, %ymm0
@@ -1302,7 +1302,7 @@ define <16 x float> @v16f32_two_step(<16 x float> %x) #2 {
;
; BTVER2-LABEL: v16f32_two_step:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BTVER2-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT: vrcpps %ymm0, %ymm2
; BTVER2-NEXT: vmulps %ymm2, %ymm0, %ymm3
; BTVER2-NEXT: vsubps %ymm3, %ymm4, %ymm3
diff --git a/llvm/test/CodeGen/X86/recip-fastmath2.ll b/llvm/test/CodeGen/X86/recip-fastmath2.ll
index 64b44fab9546b..77ccaff15e42a 100644
--- a/llvm/test/CodeGen/X86/recip-fastmath2.ll
+++ b/llvm/test/CodeGen/X86/recip-fastmath2.ll
@@ -866,7 +866,7 @@ define <8 x float> @v8f32_one_step_2_divs(<8 x float> %x) #1 {
;
; BTVER2-LABEL: v8f32_one_step_2_divs:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BTVER2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT: vrcpps %ymm0, %ymm1
; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0
; BTVER2-NEXT: vsubps %ymm0, %ymm2, %ymm0
@@ -1009,7 +1009,7 @@ define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 {
;
; BTVER2-LABEL: v8f32_two_step2:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BTVER2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT: vrcpps %ymm0, %ymm1
; BTVER2-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm2
@@ -1374,7 +1374,7 @@ define <16 x float> @v16f32_one_step_2_divs(<16 x float> %x) #1 {
;
; BTVER2-LABEL: v16f32_one_step_2_divs:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BTVER2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT: vrcpps %ymm0, %ymm2
; BTVER2-NEXT: vmulps %ymm2, %ymm0, %ymm0
; BTVER2-NEXT: vsubps %ymm0, %ymm3, %ymm0
@@ -1590,7 +1590,7 @@ define <16 x float> @v16f32_two_step2(<16 x float> %x) #2 {
;
; BTVER2-LABEL: v16f32_two_step2:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BTVER2-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT: vrcpps %ymm0, %ymm2
; BTVER2-NEXT: vmulps %ymm2, %ymm0, %ymm3
; BTVER2-NEXT: vsubps %ymm3, %ymm4, %ymm3
>From 60dc4a91d1988b93bd505c247c0e6d59f51d86da Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Mon, 19 May 2025 17:43:30 +0100
Subject: [PATCH 4/4] Add description
---
llvm/lib/Target/X86/X86FixupVectorConstants.cpp | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/llvm/lib/Target/X86/X86FixupVectorConstants.cpp b/llvm/lib/Target/X86/X86FixupVectorConstants.cpp
index 95bfaf57453b2..5d3b49b733615 100644
--- a/llvm/lib/Target/X86/X86FixupVectorConstants.cpp
+++ b/llvm/lib/Target/X86/X86FixupVectorConstants.cpp
@@ -403,6 +403,10 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
unsigned CstBitWidth = C->getType()->getPrimitiveSizeInBits();
RegBitWidth = RegBitWidth ? RegBitWidth : CstBitWidth;
for (const FixupEntry &Fixup : Fixups) {
+ // Always use the smallest possible constant load with opt/minsize,
+ // otherwise use the smallest instruction that doesn't affect
+ // performance.
+ // TODO: If constant has been hoisted from loop, use smallest constant.
if (Fixup.Op && (OptSize || NewOpcPreferable(Fixup, RegBitWidth))) {
// Construct a suitable constant and adjust the MI to use the new
// constant pool entry.