[llvm] [WIP][RFC][X86] Remove MOVSS/D -> BLENDPS/D conversions from DAG/ISEL (PR #144338)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Mon Jun 16 06:01:01 PDT 2025
https://github.com/RKSimon updated https://github.com/llvm/llvm-project/pull/144338
>From 86e31070237e9cb219336795ebcb2026cd51736e Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Mon, 16 Jun 2025 12:39:01 +0100
Subject: [PATCH] [X86] Remove MOVSS/D -> BLENDPS/D conversions from DAG/ISEL
This patch attempts to remove as much of the MOVSS/D vs BLENDPS/D OptForSize/OptForSpeed instruction selection as possible, letting the later domain-switching and X86FixupInstTuning passes handle it.
(V)MOVSS/D instructions are created in all cases, which also avoids AVX512 getting stuck with VBLENDPS/D VEX instructions that restrict register usage to XMM0-15.
getExecutionDomainCustom can now convert MOVSS/D to PBLENDW/BLENDPS to support domain switches, and X86FixupInstTuning can convert VMOVSS/D back to VBLENDPS/D if the scheduler model prefers it (and we are not building for OptSize).
Fixes #142972
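
As a minimal sketch of the intended flow (illustrative only; the function name is made up and this is not one of the test diffs below), take a scalar insertion that maps to the X86Movss pattern:

  define <4 x float> @move_low_element(<4 x float> %a, <4 x float> %b) {
    ; insert element 0 of %b into lane 0 of %a, keeping the upper lanes of %a
    %r = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
    ret <4 x float> %r
  }

ISel now always selects (v)movss here; if the scheduler model prefers the blend form (and we are not building for OptSize), X86FixupInstTuning can rewrite the VEX-compatible cases back to vblendps with immediate 0x1, and the execution-domain pass can still switch to (v)pblendw when an integer-domain form is wanted.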
---
llvm/lib/Target/X86/X86FixupInstTuning.cpp | 28 +++++
llvm/lib/Target/X86/X86InstrAVX512.td | 20 +--
llvm/lib/Target/X86/X86InstrInfo.cpp | 57 +++++++++
llvm/lib/Target/X86/X86InstrSSE.td | 76 +++---------
.../CodeGen/X86/avx-intrinsics-x86-upgrade.ll | 16 ++-
.../test/CodeGen/X86/avx512copy-intrinsics.ll | 2 +-
llvm/test/CodeGen/X86/dpbusd.ll | 1 -
llvm/test/CodeGen/X86/dpbusd_const.ll | 8 +-
.../CodeGen/X86/sse-intrinsics-fast-isel.ll | 115 ++++++++++++------
.../CodeGen/X86/sse2-intrinsics-fast-isel.ll | 92 +++++++++-----
.../X86/sse2-intrinsics-x86-upgrade.ll | 4 +-
.../X86/sse41-intrinsics-x86-upgrade.ll | 16 ++-
llvm/test/CodeGen/X86/sse41.ll | 42 ++++---
llvm/test/CodeGen/X86/vec_ss_load_fold.ll | 81 ++++--------
.../CodeGen/X86/vector-half-conversions.ll | 8 +-
.../CodeGen/X86/vector-reduce-add-mask.ll | 1 -
.../CodeGen/X86/vector-reduce-add-zext.ll | 1 -
llvm/test/CodeGen/X86/vector-reduce-add.ll | 1 -
18 files changed, 331 insertions(+), 238 deletions(-)
diff --git a/llvm/lib/Target/X86/X86FixupInstTuning.cpp b/llvm/lib/Target/X86/X86FixupInstTuning.cpp
index 33dc0a232815c..099a7147d81a1 100644
--- a/llvm/lib/Target/X86/X86FixupInstTuning.cpp
+++ b/llvm/lib/Target/X86/X86FixupInstTuning.cpp
@@ -54,6 +54,7 @@ class X86FixupInstTuningPass : public MachineFunctionPass {
private:
const X86InstrInfo *TII = nullptr;
+ const X86RegisterInfo *TRI = nullptr;
const X86Subtarget *ST = nullptr;
const MCSchedModel *SM = nullptr;
};
@@ -277,6 +278,18 @@ bool X86FixupInstTuningPass::processInstruction(
return true;
};
+ auto ProcessMOVToBLEND = [&](unsigned BlendOpc, unsigned BlendImm) -> bool {
+ if (OptSize || !NewOpcPreferable(BlendOpc, /*ReplaceInTie*/ false))
+ return false;
+ LLVM_DEBUG(dbgs() << "Replacing: " << MI);
+ {
+ MI.setDesc(TII->get(BlendOpc));
+ MI.addOperand(MachineOperand::CreateImm(BlendImm));
+ }
+ LLVM_DEBUG(dbgs() << " With: " << MI);
+ return true;
+ };
+
switch (Opc) {
case X86::BLENDPDrri:
return ProcessBLENDToMOV(X86::MOVSDrr, 0x3, 0x1);
@@ -296,6 +309,20 @@ bool X86FixupInstTuningPass::processInstruction(
// TODO: Add X86::VPBLENDWYrmi handling
return ProcessBLENDWToBLENDD(X86::VPBLENDDrri, 4);
+ case X86::VMOVSSZrr:
+ if (TRI->getEncodingValue(MI.getOperand(0).getReg()) >= 16 ||
+ TRI->getEncodingValue(MI.getOperand(1).getReg()) >= 16 ||
+ TRI->getEncodingValue(MI.getOperand(2).getReg()) >= 16)
+ return false;
+ return ProcessMOVToBLEND(X86::VBLENDPSrri, 0x01);
+
+ case X86::VMOVSDZrr:
+ if (TRI->getEncodingValue(MI.getOperand(0).getReg()) >= 16 ||
+ TRI->getEncodingValue(MI.getOperand(1).getReg()) >= 16 ||
+ TRI->getEncodingValue(MI.getOperand(2).getReg()) >= 16)
+ return false;
+ return ProcessMOVToBLEND(X86::VBLENDPDrri, 0x01);
+
case X86::VPERMILPDri:
return ProcessVPERMILPDri(X86::VSHUFPDrri);
case X86::VPERMILPDYri:
@@ -573,6 +600,7 @@ bool X86FixupInstTuningPass::runOnMachineFunction(MachineFunction &MF) {
bool Changed = false;
ST = &MF.getSubtarget<X86Subtarget>();
TII = ST->getInstrInfo();
+ TRI = ST->getRegisterInfo();
SM = &ST->getSchedModel();
for (MachineBasicBlock &MBB : MF) {
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index 0ab94cca41425..d369a2d8e9f68 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -3904,13 +3904,12 @@ def : Pat<(f64 (bitconvert VK64:$src)),
multiclass avx512_move_scalar<string asm, SDNode OpNode, PatFrag vzload_frag,
X86VectorVTInfo _, Predicate prd = HasAVX512> {
- let Predicates = !if (!eq (prd, HasFP16), [HasFP16], [prd, OptForSize]) in
+ let Predicates = [prd] in {
def rr : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2),
!strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set _.RC:$dst, (_.VT (OpNode _.RC:$src1, _.RC:$src2)))],
_.ExeDomain>, EVEX, VVVV, Sched<[SchedWriteFShuffle.XMM]>;
- let Predicates = [prd] in {
def rrkz : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2),
!strconcat(asm, "\t{$src2, $src1, $dst {${mask}} {z}|",
@@ -4394,7 +4393,7 @@ def : InstAlias<"vmovsd.s\t{$src2, $src1, $dst {${mask}} {z}|"#
(VMOVSDZrrkz_REV VR128X:$dst, VK1WM:$mask,
VR128X:$src1, VR128X:$src2), 0>;
-let Predicates = [HasAVX512, OptForSize] in {
+let Predicates = [HasAVX512] in {
def : Pat<(v4f32 (X86vzmovl (v4f32 VR128X:$src))),
(VMOVSSZrr (v4f32 (AVX512_128_SET0)), VR128X:$src)>;
def : Pat<(v4i32 (X86vzmovl (v4i32 VR128X:$src))),
@@ -4420,21 +4419,6 @@ let Predicates = [HasAVX512, OptForSize] in {
(v4i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm)))), sub_xmm)>;
}
-// Use 128-bit blends for OptForSpeed since BLENDs have better throughput than
-// VMOVSS/SD. Unfortunately, loses the ability to use XMM16-31.
-let Predicates = [HasAVX512, OptForSpeed] in {
- def : Pat<(v16f32 (X86vzmovl (v16f32 VR512:$src))),
- (SUBREG_TO_REG (i32 0),
- (v4f32 (VBLENDPSrri (v4f32 (V_SET0)),
- (v4f32 (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm)),
- (i8 1))), sub_xmm)>;
- def : Pat<(v16i32 (X86vzmovl (v16i32 VR512:$src))),
- (SUBREG_TO_REG (i32 0),
- (v4i32 (VPBLENDWrri (v4i32 (V_SET0)),
- (v4i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm)),
- (i8 3))), sub_xmm)>;
-}
-
let Predicates = [HasAVX512] in {
def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
(VMOVSSZrm addr:$src)>;
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index abf365eedec39..d97f424f808e5 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -9073,6 +9073,30 @@ uint16_t X86InstrInfo::getExecutionDomainCustom(const MachineInstr &MI) const {
case X86::VPBLENDWYrmi:
case X86::VPBLENDWYrri:
return GetBlendDomains(8, false);
+ case X86::VMOVSSZrr:
+ // Only convert to BLEND if we are VEX compatible.
+ if (RI.getEncodingValue(MI.getOperand(0).getReg()) >= 16 ||
+ RI.getEncodingValue(MI.getOperand(1).getReg()) >= 16 ||
+ RI.getEncodingValue(MI.getOperand(2).getReg()) >= 16)
+ return 0;
+ [[fallthrough]];
+ case X86::MOVSSrr:
+ case X86::VMOVSSrr:
+ if (Subtarget.hasSSE41())
+ return 0x2 | 0x8; // PackedSingle | PackedInt
+ return 0x2; // PackedSingle
+ case X86::VMOVSDZrr:
+ // Only convert to BLEND if we are VEX compatible.
+ if (RI.getEncodingValue(MI.getOperand(0).getReg()) >= 16 ||
+ RI.getEncodingValue(MI.getOperand(1).getReg()) >= 16 ||
+ RI.getEncodingValue(MI.getOperand(2).getReg()) >= 16)
+ return 0;
+ [[fallthrough]];
+ case X86::MOVSDrr:
+ case X86::VMOVSDrr:
+ if (Subtarget.hasSSE41())
+ return 0x2 | 0x4 | 0x8; // PackedSingle | PackedDouble | PackedInt
+ return 0x4; // PackedDouble
case X86::VPANDDZ128rr:
case X86::VPANDDZ128rm:
case X86::VPANDDZ256rr:
@@ -9213,6 +9237,39 @@ bool X86InstrInfo::setExecutionDomainCustom(MachineInstr &MI,
case X86::VPBLENDWYrmi:
case X86::VPBLENDWYrri:
return SetBlendDomain(16, true);
+ case X86::MOVSSrr:
+ case X86::VMOVSSrr:
+ case X86::VMOVSSZrr:
+ if (Domain == 3) { // PackedInt
+ MI.setDesc(
+ get(Opcode == X86::MOVSSrr ? X86::PBLENDWrri : X86::VPBLENDWrri));
+ MI.addOperand(MachineOperand::CreateImm(0x03));
+ if (Opcode == X86::VMOVSSZrr)
+ MI.setAsmPrinterFlag(X86::AC_EVEX_2_VEX);
+ return true;
+ }
+ return Domain == 1; // PackedSingle
+ case X86::MOVSDrr:
+ case X86::VMOVSDrr:
+ case X86::VMOVSDZrr:
+ if (Domain == 1) { // PackedSingle
+ MI.setDesc(
+ get(Opcode == X86::MOVSDrr ? X86::BLENDPSrri : X86::VBLENDPSrri));
+ MI.addOperand(MachineOperand::CreateImm(0x03));
+ if (Opcode == X86::VMOVSDZrr)
+ MI.setAsmPrinterFlag(X86::AC_EVEX_2_VEX);
+ return true;
+ } else if (Domain == 2) { // PackedDouble
+ return true;
+ } else if (Domain == 3) { // PackedInt
+ MI.setDesc(
+ get(Opcode == X86::MOVSDrr ? X86::PBLENDWrri : X86::VPBLENDWrri));
+ MI.addOperand(MachineOperand::CreateImm(0x0F));
+ if (Opcode == X86::VMOVSDZrr)
+ MI.setAsmPrinterFlag(X86::AC_EVEX_2_VEX);
+ return true;
+ }
+ return false;
case X86::VPANDDZ128rr:
case X86::VPANDDZ128rm:
case X86::VPANDDZ256rr:
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 1acc0cd8da205..cbff8ffd4d761 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -209,10 +209,8 @@ multiclass sse12_move_rr<SDNode OpNode, ValueType vt, string base_opc,
}
multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
- X86MemOperand x86memop, string OpcodeStr,
- Domain d, Predicate pred> {
+ X86MemOperand x86memop, string OpcodeStr, Domain d> {
// AVX
- let Predicates = [UseAVX, OptForSize] in
defm V#NAME : sse12_move_rr<OpNode, vt, OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}", d>,
VEX, VVVV, VEX_LIG, WIG;
@@ -223,7 +221,6 @@ multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
VEX, VEX_LIG, Sched<[WriteFStore]>, WIG;
// SSE1 & 2
let Constraints = "$src1 = $dst" in {
- let Predicates = [pred, NoSSE41_Or_OptForSize] in
defm NAME : sse12_move_rr<OpNode, vt, OpcodeStr,
"\t{$src2, $dst|$dst, $src2}", d>;
}
@@ -268,9 +265,9 @@ multiclass sse12_move_rm<RegisterClass RC, ValueType vt, X86MemOperand x86memop,
}
defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss",
- SSEPackedSingle, UseSSE1>, TB, XS;
+ SSEPackedSingle>, TB, XS;
defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd",
- SSEPackedDouble, UseSSE2>, TB, XD;
+ SSEPackedDouble>, TB, XD;
let canFoldAsLoad = 1, isReMaterializable = 1 in {
defm MOVSS : sse12_move_rm<FR32, v4f32, f32mem, loadf32, X86vzload32, "movss",
@@ -292,9 +289,7 @@ let Predicates = [UseAVX] in {
(SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
def : Pat<(v4f64 (X86vzload64 addr:$src)),
(SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
-}
-let Predicates = [UseAVX, OptForSize] in {
// Move scalar to XMM zero-extended, zeroing a VR128 then do a
// MOVSS to the lower bits.
def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
@@ -313,22 +308,21 @@ let Predicates = [UseAVX, OptForSize] in {
(v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)))), sub_xmm)>;
}
-let Predicates = [UseSSE1, NoSSE41_Or_OptForSize] in {
-// Move scalar to XMM zero-extended, zeroing a VR128 then do a
-// MOVSS to the lower bits.
-def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
- (MOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
-def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
- (MOVSSrr (v4i32 (V_SET0)), VR128:$src)>;
-}
-
let Predicates = [UseSSE2] in
def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
(MOVSDrm addr:$src)>;
-let Predicates = [UseSSE1] in
-def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
- (MOVSSrm addr:$src)>;
+let Predicates = [UseSSE1] in {
+ def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
+ (MOVSSrm addr:$src)>;
+
+ // Move scalar to XMM zero-extended, zeroing a VR128 then do a
+ // MOVSS to the lower bits.
+ def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
+ (MOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
+ def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
+ (MOVSSrr (v4i32 (V_SET0)), VR128:$src)>;
+}
//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Aligned/Unaligned FP Instructions
@@ -6382,61 +6376,25 @@ let Predicates = [HasAVX] in {
(VBLENDVPDYrrr VR256:$src2, VR256:$src1, VR256:$mask)>;
}
-// Prefer a movss or movsd over a blendps when optimizing for size. these were
-// changed to use blends because blends have better throughput on sandybridge
-// and haswell, but movs[s/d] are 1-2 byte shorter instructions.
-let Predicates = [HasAVX, OptForSpeed] in {
- def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
- (VBLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
- def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
- (VPBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;
-
- def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
- (VBLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>;
+// TODO: Remove these and let foldMemoryOperandCustom handle it?
+let Predicates = [HasAVX] in {
def : Pat<(v4f32 (X86Movss VR128:$src1, (loadv4f32 addr:$src2))),
(VBLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>;
def : Pat<(v4f32 (X86Movss (loadv4f32 addr:$src2), VR128:$src1)),
(VBLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>;
- def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
- (VBLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
def : Pat<(v2f64 (X86Movsd VR128:$src1, (loadv2f64 addr:$src2))),
(VBLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>;
def : Pat<(v2f64 (X86Movsd (loadv2f64 addr:$src2), VR128:$src1)),
(VBLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>;
-
- // Move low f32 and clear high bits.
- def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
- (SUBREG_TO_REG (i32 0),
- (v4f32 (VBLENDPSrri (v4f32 (V_SET0)),
- (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)),
- (i8 1))), sub_xmm)>;
- def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
- (SUBREG_TO_REG (i32 0),
- (v4i32 (VPBLENDWrri (v4i32 (V_SET0)),
- (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)),
- (i8 3))), sub_xmm)>;
}
-// Prefer a movss or movsd over a blendps when optimizing for size. these were
-// changed to use blends because blends have better throughput on sandybridge
-// and haswell, but movs[s/d] are 1-2 byte shorter instructions.
-let Predicates = [UseSSE41, OptForSpeed] in {
- // With SSE41 we can use blends for these patterns.
- def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
- (BLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
- def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
- (PBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;
-
- def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
- (BLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>;
+let Predicates = [UseSSE41] in {
def : Pat<(v4f32 (X86Movss VR128:$src1, (memopv4f32 addr:$src2))),
(BLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>;
def : Pat<(v4f32 (X86Movss (memopv4f32 addr:$src2), VR128:$src1)),
(BLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>;
- def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
- (BLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
def : Pat<(v2f64 (X86Movsd VR128:$src1, (memopv2f64 addr:$src2))),
(BLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>;
def : Pat<(v2f64 (X86Movsd (memopv2f64 addr:$src2), VR128:$src1)),
diff --git a/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll b/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
index f0203b3b889e4..87ea43f87b2f2 100644
--- a/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
@@ -298,11 +298,17 @@ declare <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64>, i32) nounwind readnone
define <2 x double> @test_x86_sse41_blendpd(<2 x double> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test_x86_sse41_blendpd:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vmovsd %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf3,0x10,0xc0]
-; CHECK-NEXT: # xmm0 = xmm0[0],xmm1[1]
-; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+; AVX-LABEL: test_x86_sse41_blendpd:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovsd %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf3,0x10,0xc0]
+; AVX-NEXT: # xmm0 = xmm0[0],xmm1[1]
+; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_sse41_blendpd:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovsd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf3,0x10,0xc0]
+; AVX512VL-NEXT: # xmm0 = xmm0[0],xmm1[1]
+; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i8 2) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
diff --git a/llvm/test/CodeGen/X86/avx512copy-intrinsics.ll b/llvm/test/CodeGen/X86/avx512copy-intrinsics.ll
index a2af7df44010e..361dccf741aee 100644
--- a/llvm/test/CodeGen/X86/avx512copy-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512copy-intrinsics.ll
@@ -11,7 +11,7 @@ define <4 x i32> @test_mm_move_epi32(<4 x i32> %a0) nounwind {
; NOAVX512MOVZXC-LABEL: test_mm_move_epi32:
; NOAVX512MOVZXC: # %bb.0:
; NOAVX512MOVZXC-NEXT: vxorps %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf0,0x57,0xc9]
-; NOAVX512MOVZXC-NEXT: vmovss %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf2,0x10,0xc0]
+; NOAVX512MOVZXC-NEXT: vmovss %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf2,0x10,0xc0]
; NOAVX512MOVZXC-NEXT: # xmm0 = xmm0[0],xmm1[1,2,3]
; NOAVX512MOVZXC-NEXT: retq # encoding: [0xc3]
%res = shufflevector <4 x i32> %a0, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 4, i32 4>
diff --git a/llvm/test/CodeGen/X86/dpbusd.ll b/llvm/test/CodeGen/X86/dpbusd.ll
index 3aa77c3955c63..1608c421ed548 100644
--- a/llvm/test/CodeGen/X86/dpbusd.ll
+++ b/llvm/test/CodeGen/X86/dpbusd.ll
@@ -345,7 +345,6 @@ define i32 @vpdpbusd_128(ptr%a, ptr%b, i32 %c, i32 %n) {
; AVX512VLVNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLVNNI-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
; AVX512VLVNNI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
-; AVX512VLVNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLVNNI-NEXT: vpdpbusd %xmm1, %xmm0, %xmm2
; AVX512VLVNNI-NEXT: vmovd %xmm2, %eax
; AVX512VLVNNI-NEXT: addl %edx, %eax
diff --git a/llvm/test/CodeGen/X86/dpbusd_const.ll b/llvm/test/CodeGen/X86/dpbusd_const.ll
index 456e6e8f263aa..c32d674a84435 100644
--- a/llvm/test/CodeGen/X86/dpbusd_const.ll
+++ b/llvm/test/CodeGen/X86/dpbusd_const.ll
@@ -48,7 +48,6 @@ define i32 @mul_4xi8_zc(<4 x i8> %a, i32 %c) {
; AVX512VLVNNI: # %bb.0: # %entry
; AVX512VLVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VLVNNI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; AVX512VLVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VLVNNI-NEXT: vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX512VLVNNI-NEXT: vmovd %xmm1, %eax
; AVX512VLVNNI-NEXT: addl %edi, %eax
@@ -130,10 +129,9 @@ define i32 @mul_4xi8_cs(<4 x i8> %a, i32 %c) {
; AVX512VLVNNI: # %bb.0: # %entry
; AVX512VLVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VLVNNI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; AVX512VLVNNI-NEXT: vmovd {{.*#+}} xmm1 = [16,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512VLVNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VLVNNI-NEXT: vpdpbusd %xmm0, %xmm1, %xmm2
-; AVX512VLVNNI-NEXT: vmovd %xmm2, %eax
+; AVX512VLVNNI-NEXT: vmovd {{.*#+}} xmm2 = [16,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512VLVNNI-NEXT: vpdpbusd %xmm0, %xmm2, %xmm1
+; AVX512VLVNNI-NEXT: vmovd %xmm1, %eax
; AVX512VLVNNI-NEXT: addl %edi, %eax
; AVX512VLVNNI-NEXT: retq
entry:
diff --git a/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
index 2e2e78a6da51e..1fca9b78352ec 100644
--- a/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
@@ -187,12 +187,19 @@ define <4 x float> @test_mm_cmpge_ss(<4 x float> %a0, <4 x float> %a1) nounwind
; SSE-NEXT: # xmm0 = xmm1[0],xmm0[1,2,3]
; SSE-NEXT: ret{{[l|q]}} # encoding: [0xc3]
;
-; AVX-LABEL: test_mm_cmpge_ss:
-; AVX: # %bb.0:
-; AVX-NEXT: vcmpless %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf2,0xc2,0xc8,0x02]
-; AVX-NEXT: vmovss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x10,0xc1]
-; AVX-NEXT: # xmm0 = xmm1[0],xmm0[1,2,3]
-; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+; AVX1-LABEL: test_mm_cmpge_ss:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vcmpless %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf2,0xc2,0xc8,0x02]
+; AVX1-NEXT: vmovss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x10,0xc1]
+; AVX1-NEXT: # xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX1-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX512-LABEL: test_mm_cmpge_ss:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vcmpless %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf2,0xc2,0xc8,0x02]
+; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0xc1]
+; AVX512-NEXT: # xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 2)
%res = shufflevector <4 x float> %a0, <4 x float> %cmp, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
ret <4 x float> %res
@@ -229,12 +236,19 @@ define <4 x float> @test_mm_cmpgt_ss(<4 x float> %a0, <4 x float> %a1) nounwind
; SSE-NEXT: # xmm0 = xmm1[0],xmm0[1,2,3]
; SSE-NEXT: ret{{[l|q]}} # encoding: [0xc3]
;
-; AVX-LABEL: test_mm_cmpgt_ss:
-; AVX: # %bb.0:
-; AVX-NEXT: vcmpltss %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf2,0xc2,0xc8,0x01]
-; AVX-NEXT: vmovss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x10,0xc1]
-; AVX-NEXT: # xmm0 = xmm1[0],xmm0[1,2,3]
-; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+; AVX1-LABEL: test_mm_cmpgt_ss:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vcmpltss %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf2,0xc2,0xc8,0x01]
+; AVX1-NEXT: vmovss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x10,0xc1]
+; AVX1-NEXT: # xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX1-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX512-LABEL: test_mm_cmpgt_ss:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vcmpltss %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf2,0xc2,0xc8,0x01]
+; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0xc1]
+; AVX512-NEXT: # xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 1)
%res = shufflevector <4 x float> %a0, <4 x float> %cmp, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
ret <4 x float> %res
@@ -379,12 +393,19 @@ define <4 x float> @test_mm_cmpnge_ss(<4 x float> %a0, <4 x float> %a1) nounwind
; SSE-NEXT: # xmm0 = xmm1[0],xmm0[1,2,3]
; SSE-NEXT: ret{{[l|q]}} # encoding: [0xc3]
;
-; AVX-LABEL: test_mm_cmpnge_ss:
-; AVX: # %bb.0:
-; AVX-NEXT: vcmpnless %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf2,0xc2,0xc8,0x06]
-; AVX-NEXT: vmovss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x10,0xc1]
-; AVX-NEXT: # xmm0 = xmm1[0],xmm0[1,2,3]
-; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+; AVX1-LABEL: test_mm_cmpnge_ss:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vcmpnless %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf2,0xc2,0xc8,0x06]
+; AVX1-NEXT: vmovss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x10,0xc1]
+; AVX1-NEXT: # xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX1-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX512-LABEL: test_mm_cmpnge_ss:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vcmpnless %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf2,0xc2,0xc8,0x06]
+; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0xc1]
+; AVX512-NEXT: # xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 6)
%res = shufflevector <4 x float> %a0, <4 x float> %cmp, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
ret <4 x float> %res
@@ -421,12 +442,19 @@ define <4 x float> @test_mm_cmpngt_ss(<4 x float> %a0, <4 x float> %a1) nounwind
; SSE-NEXT: # xmm0 = xmm1[0],xmm0[1,2,3]
; SSE-NEXT: ret{{[l|q]}} # encoding: [0xc3]
;
-; AVX-LABEL: test_mm_cmpngt_ss:
-; AVX: # %bb.0:
-; AVX-NEXT: vcmpnltss %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf2,0xc2,0xc8,0x05]
-; AVX-NEXT: vmovss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x10,0xc1]
-; AVX-NEXT: # xmm0 = xmm1[0],xmm0[1,2,3]
-; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+; AVX1-LABEL: test_mm_cmpngt_ss:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vcmpnltss %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf2,0xc2,0xc8,0x05]
+; AVX1-NEXT: vmovss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x10,0xc1]
+; AVX1-NEXT: # xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX1-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX512-LABEL: test_mm_cmpngt_ss:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vcmpnltss %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf2,0xc2,0xc8,0x05]
+; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0xc1]
+; AVX512-NEXT: # xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 5)
%res = shufflevector <4 x float> %a0, <4 x float> %cmp, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
ret <4 x float> %res
@@ -1601,11 +1629,17 @@ define <4 x float> @test_mm_move_ss(<4 x float> %a0, <4 x float> %a1) {
; SSE-NEXT: # xmm0 = xmm1[0],xmm0[1,2,3]
; SSE-NEXT: ret{{[l|q]}} # encoding: [0xc3]
;
-; AVX-LABEL: test_mm_move_ss:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x10,0xc1]
-; AVX-NEXT: # xmm0 = xmm1[0],xmm0[1,2,3]
-; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+; AVX1-LABEL: test_mm_move_ss:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x10,0xc1]
+; AVX1-NEXT: # xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX1-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX512-LABEL: test_mm_move_ss:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0xc1]
+; AVX512-NEXT: # xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
ret <4 x float> %res
}
@@ -2227,8 +2261,8 @@ define <4 x float> @test_mm_set_ss(float %a0) nounwind {
; X86-AVX512: # %bb.0:
; X86-AVX512-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-AVX512-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x44,0x24,0x04]
-; X86-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf0,0x57,0xc9]
-; X86-AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf2,0x10,0xc0]
+; X86-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9]
+; X86-AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf2,0x10,0xc0]
; X86-AVX512-NEXT: # xmm0 = xmm0[0],xmm1[1,2,3]
; X86-AVX512-NEXT: retl # encoding: [0xc3]
;
@@ -2240,12 +2274,19 @@ define <4 x float> @test_mm_set_ss(float %a0) nounwind {
; X64-SSE-NEXT: movaps %xmm1, %xmm0 # encoding: [0x0f,0x28,0xc1]
; X64-SSE-NEXT: retq # encoding: [0xc3]
;
-; X64-AVX-LABEL: test_mm_set_ss:
-; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf0,0x57,0xc9]
-; X64-AVX-NEXT: vmovss %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf2,0x10,0xc0]
-; X64-AVX-NEXT: # xmm0 = xmm0[0],xmm1[1,2,3]
-; X64-AVX-NEXT: retq # encoding: [0xc3]
+; X64-AVX1-LABEL: test_mm_set_ss:
+; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf0,0x57,0xc9]
+; X64-AVX1-NEXT: vmovss %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf2,0x10,0xc0]
+; X64-AVX1-NEXT: # xmm0 = xmm0[0],xmm1[1,2,3]
+; X64-AVX1-NEXT: retq # encoding: [0xc3]
+;
+; X64-AVX512-LABEL: test_mm_set_ss:
+; X64-AVX512: # %bb.0:
+; X64-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9]
+; X64-AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf2,0x10,0xc0]
+; X64-AVX512-NEXT: # xmm0 = xmm0[0],xmm1[1,2,3]
+; X64-AVX512-NEXT: retq # encoding: [0xc3]
%res0 = insertelement <4 x float> undef, float %a0, i32 0
%res1 = insertelement <4 x float> %res0, float 0.0, i32 1
%res2 = insertelement <4 x float> %res1, float 0.0, i32 2
diff --git a/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
index 3f48b22e2b9ff..79adbb5a54248 100644
--- a/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
@@ -628,12 +628,19 @@ define <2 x double> @test_mm_cmpge_sd(<2 x double> %a0, <2 x double> %a1) nounwi
; SSE-NEXT: # xmm0 = xmm1[0],xmm0[1]
; SSE-NEXT: ret{{[l|q]}} # encoding: [0xc3]
;
-; AVX-LABEL: test_mm_cmpge_sd:
-; AVX: # %bb.0:
-; AVX-NEXT: vcmplesd %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf3,0xc2,0xc8,0x02]
-; AVX-NEXT: vmovsd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfb,0x10,0xc1]
-; AVX-NEXT: # xmm0 = xmm1[0],xmm0[1]
-; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+; AVX1-LABEL: test_mm_cmpge_sd:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vcmplesd %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf3,0xc2,0xc8,0x02]
+; AVX1-NEXT: vmovsd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfb,0x10,0xc1]
+; AVX1-NEXT: # xmm0 = xmm1[0],xmm0[1]
+; AVX1-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX512-LABEL: test_mm_cmpge_sd:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vcmplesd %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf3,0xc2,0xc8,0x02]
+; AVX512-NEXT: vmovsd %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0xc1]
+; AVX512-NEXT: # xmm0 = xmm1[0],xmm0[1]
+; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%cmp = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a1, <2 x double> %a0, i8 2)
%ext0 = extractelement <2 x double> %cmp, i32 0
%ins0 = insertelement <2 x double> undef, double %ext0, i32 0
@@ -745,12 +752,19 @@ define <2 x double> @test_mm_cmpgt_sd(<2 x double> %a0, <2 x double> %a1) nounwi
; SSE-NEXT: # xmm0 = xmm1[0],xmm0[1]
; SSE-NEXT: ret{{[l|q]}} # encoding: [0xc3]
;
-; AVX-LABEL: test_mm_cmpgt_sd:
-; AVX: # %bb.0:
-; AVX-NEXT: vcmpltsd %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf3,0xc2,0xc8,0x01]
-; AVX-NEXT: vmovsd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfb,0x10,0xc1]
-; AVX-NEXT: # xmm0 = xmm1[0],xmm0[1]
-; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+; AVX1-LABEL: test_mm_cmpgt_sd:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vcmpltsd %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf3,0xc2,0xc8,0x01]
+; AVX1-NEXT: vmovsd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfb,0x10,0xc1]
+; AVX1-NEXT: # xmm0 = xmm1[0],xmm0[1]
+; AVX1-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX512-LABEL: test_mm_cmpgt_sd:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vcmpltsd %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf3,0xc2,0xc8,0x01]
+; AVX512-NEXT: vmovsd %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0xc1]
+; AVX512-NEXT: # xmm0 = xmm1[0],xmm0[1]
+; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%cmp = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a1, <2 x double> %a0, i8 1)
%ext0 = extractelement <2 x double> %cmp, i32 0
%ins0 = insertelement <2 x double> undef, double %ext0, i32 0
@@ -973,12 +987,19 @@ define <2 x double> @test_mm_cmpnge_sd(<2 x double> %a0, <2 x double> %a1) nounw
; SSE-NEXT: # xmm0 = xmm1[0],xmm0[1]
; SSE-NEXT: ret{{[l|q]}} # encoding: [0xc3]
;
-; AVX-LABEL: test_mm_cmpnge_sd:
-; AVX: # %bb.0:
-; AVX-NEXT: vcmpnlesd %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf3,0xc2,0xc8,0x06]
-; AVX-NEXT: vmovsd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfb,0x10,0xc1]
-; AVX-NEXT: # xmm0 = xmm1[0],xmm0[1]
-; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+; AVX1-LABEL: test_mm_cmpnge_sd:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vcmpnlesd %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf3,0xc2,0xc8,0x06]
+; AVX1-NEXT: vmovsd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfb,0x10,0xc1]
+; AVX1-NEXT: # xmm0 = xmm1[0],xmm0[1]
+; AVX1-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX512-LABEL: test_mm_cmpnge_sd:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vcmpnlesd %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf3,0xc2,0xc8,0x06]
+; AVX512-NEXT: vmovsd %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0xc1]
+; AVX512-NEXT: # xmm0 = xmm1[0],xmm0[1]
+; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%cmp = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a1, <2 x double> %a0, i8 6)
%ext0 = extractelement <2 x double> %cmp, i32 0
%ins0 = insertelement <2 x double> undef, double %ext0, i32 0
@@ -1018,12 +1039,19 @@ define <2 x double> @test_mm_cmpngt_sd(<2 x double> %a0, <2 x double> %a1) nounw
; SSE-NEXT: # xmm0 = xmm1[0],xmm0[1]
; SSE-NEXT: ret{{[l|q]}} # encoding: [0xc3]
;
-; AVX-LABEL: test_mm_cmpngt_sd:
-; AVX: # %bb.0:
-; AVX-NEXT: vcmpnltsd %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf3,0xc2,0xc8,0x05]
-; AVX-NEXT: vmovsd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfb,0x10,0xc1]
-; AVX-NEXT: # xmm0 = xmm1[0],xmm0[1]
-; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+; AVX1-LABEL: test_mm_cmpngt_sd:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vcmpnltsd %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf3,0xc2,0xc8,0x05]
+; AVX1-NEXT: vmovsd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfb,0x10,0xc1]
+; AVX1-NEXT: # xmm0 = xmm1[0],xmm0[1]
+; AVX1-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX512-LABEL: test_mm_cmpngt_sd:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vcmpnltsd %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf3,0xc2,0xc8,0x05]
+; AVX512-NEXT: vmovsd %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0xc1]
+; AVX512-NEXT: # xmm0 = xmm1[0],xmm0[1]
+; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%cmp = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a1, <2 x double> %a0, i8 5)
%ext0 = extractelement <2 x double> %cmp, i32 0
%ins0 = insertelement <2 x double> undef, double %ext0, i32 0
@@ -3008,11 +3036,17 @@ define <2 x double> @test_mm_move_sd(<2 x double> %a0, <2 x double> %a1) nounwin
; SSE-NEXT: # xmm0 = xmm1[0],xmm0[1]
; SSE-NEXT: ret{{[l|q]}} # encoding: [0xc3]
;
-; AVX-LABEL: test_mm_move_sd:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovsd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfb,0x10,0xc1]
-; AVX-NEXT: # xmm0 = xmm1[0],xmm0[1]
-; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+; AVX1-LABEL: test_mm_move_sd:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovsd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfb,0x10,0xc1]
+; AVX1-NEXT: # xmm0 = xmm1[0],xmm0[1]
+; AVX1-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX512-LABEL: test_mm_move_sd:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovsd %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0xc1]
+; AVX512-NEXT: # xmm0 = xmm1[0],xmm0[1]
+; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%ext0 = extractelement <2 x double> %a1, i32 0
%res0 = insertelement <2 x double> undef, double %ext0, i32 0
%ext1 = extractelement <2 x double> %a0, i32 1
diff --git a/llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll b/llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
index 413b4e79257a0..423e298b11faa 100644
--- a/llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
+++ b/llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
@@ -734,7 +734,7 @@ define <2 x double> @test_x86_sse2_cvtss2sd_load(<2 x double> %a0, ptr %p1) {
; X86-AVX512-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-AVX512-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x08]
; X86-AVX512-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf2,0x5a,0xc9]
-; X86-AVX512-NEXT: vmovsd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x10,0xc1]
+; X86-AVX512-NEXT: vmovsd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0xc1]
; X86-AVX512-NEXT: ## xmm0 = xmm1[0],xmm0[1]
; X86-AVX512-NEXT: retl ## encoding: [0xc3]
;
@@ -761,7 +761,7 @@ define <2 x double> @test_x86_sse2_cvtss2sd_load(<2 x double> %a0, ptr %p1) {
; X64-AVX512-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-AVX512-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x0f]
; X64-AVX512-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf2,0x5a,0xc9]
-; X64-AVX512-NEXT: vmovsd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x10,0xc1]
+; X64-AVX512-NEXT: vmovsd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0xc1]
; X64-AVX512-NEXT: ## xmm0 = xmm1[0],xmm0[1]
; X64-AVX512-NEXT: retq ## encoding: [0xc3]
%a1 = load <4 x float>, ptr %p1
diff --git a/llvm/test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll b/llvm/test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll
index 137606b7cfeed..0d360ba7c005e 100644
--- a/llvm/test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll
+++ b/llvm/test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll
@@ -16,11 +16,17 @@ define <2 x double> @test_x86_sse41_blendpd(<2 x double> %a0, <2 x double> %a1)
; SSE-NEXT: ## xmm0 = xmm0[0,1],xmm1[2,3]
; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
-; AVX-LABEL: test_x86_sse41_blendpd:
-; AVX: ## %bb.0:
-; AVX-NEXT: vmovsd %xmm0, %xmm1, %xmm0 ## encoding: [0xc5,0xf3,0x10,0xc0]
-; AVX-NEXT: ## xmm0 = xmm0[0],xmm1[1]
-; AVX-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
+; AVX1-LABEL: test_x86_sse41_blendpd:
+; AVX1: ## %bb.0:
+; AVX1-NEXT: vmovsd %xmm0, %xmm1, %xmm0 ## encoding: [0xc5,0xf3,0x10,0xc0]
+; AVX1-NEXT: ## xmm0 = xmm0[0],xmm1[1]
+; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512-LABEL: test_x86_sse41_blendpd:
+; AVX512: ## %bb.0:
+; AVX512-NEXT: vmovsd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf3,0x10,0xc0]
+; AVX512-NEXT: ## xmm0 = xmm0[0],xmm1[1]
+; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i32 6) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
diff --git a/llvm/test/CodeGen/X86/sse41.ll b/llvm/test/CodeGen/X86/sse41.ll
index 4f5b7ee0eaea0..3c6d220bc0ffa 100644
--- a/llvm/test/CodeGen/X86/sse41.ll
+++ b/llvm/test/CodeGen/X86/sse41.ll
@@ -3,8 +3,8 @@
; RUN: llc < %s -disable-peephole -mtriple=i386-apple-darwin -mattr=+avx -show-mc-encoding | FileCheck %s --check-prefixes=AVX,AVX1,X86-AVX1
; RUN: llc < %s -disable-peephole -mtriple=i386-apple-darwin -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl -show-mc-encoding | FileCheck %s --check-prefixes=AVX,AVX512,X86-AVX512
; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=+sse4.1 -show-mc-encoding | FileCheck %s --check-prefixes=SSE,X64-SSE
-; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=+avx -show-mc-encoding | FileCheck %s --check-prefixes=AVX,X64-AVX,AVX1,X64-AVX1
-; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl -show-mc-encoding | FileCheck %s --check-prefixes=AVX,X64-AVX,AVX512,X64-AVX512
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=+avx -show-mc-encoding | FileCheck %s --check-prefixes=AVX,AVX1,X64-AVX1
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl -show-mc-encoding | FileCheck %s --check-prefixes=AVX,AVX512,X64-AVX512
@g16 = external global i16
@@ -361,7 +361,7 @@ define <4 x float> @blendps_not_insertps_1(<4 x float> %t1, float %t2) nounwind
; X86-AVX512: ## %bb.0:
; X86-AVX512-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 ## xmm1 = mem[0],zero,zero,zero
; X86-AVX512-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x04]
-; X86-AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x10,0xc1]
+; X86-AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0xc1]
; X86-AVX512-NEXT: ## xmm0 = xmm1[0],xmm0[1,2,3]
; X86-AVX512-NEXT: retl ## encoding: [0xc3]
;
@@ -371,11 +371,17 @@ define <4 x float> @blendps_not_insertps_1(<4 x float> %t1, float %t2) nounwind
; X64-SSE-NEXT: ## xmm0 = xmm1[0],xmm0[1,2,3]
; X64-SSE-NEXT: retq ## encoding: [0xc3]
;
-; X64-AVX-LABEL: blendps_not_insertps_1:
-; X64-AVX: ## %bb.0:
-; X64-AVX-NEXT: vmovss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x10,0xc1]
-; X64-AVX-NEXT: ## xmm0 = xmm1[0],xmm0[1,2,3]
-; X64-AVX-NEXT: retq ## encoding: [0xc3]
+; X64-AVX1-LABEL: blendps_not_insertps_1:
+; X64-AVX1: ## %bb.0:
+; X64-AVX1-NEXT: vmovss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x10,0xc1]
+; X64-AVX1-NEXT: ## xmm0 = xmm1[0],xmm0[1,2,3]
+; X64-AVX1-NEXT: retq ## encoding: [0xc3]
+;
+; X64-AVX512-LABEL: blendps_not_insertps_1:
+; X64-AVX512: ## %bb.0:
+; X64-AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0xc1]
+; X64-AVX512-NEXT: ## xmm0 = xmm1[0],xmm0[1,2,3]
+; X64-AVX512-NEXT: retq ## encoding: [0xc3]
%tmp1 = insertelement <4 x float> %t1, float %t2, i32 0
ret <4 x float> %tmp1
}
@@ -438,11 +444,17 @@ define <4 x float> @blendps_not_insertps_2(<4 x float> %t1, <4 x float> %t2) nou
; SSE-NEXT: ## xmm0 = xmm1[0],xmm0[1,2,3]
; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
-; AVX-LABEL: blendps_not_insertps_2:
-; AVX: ## %bb.0:
-; AVX-NEXT: vmovss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x10,0xc1]
-; AVX-NEXT: ## xmm0 = xmm1[0],xmm0[1,2,3]
-; AVX-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
+; AVX1-LABEL: blendps_not_insertps_2:
+; AVX1: ## %bb.0:
+; AVX1-NEXT: vmovss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x10,0xc1]
+; AVX1-NEXT: ## xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512-LABEL: blendps_not_insertps_2:
+; AVX512: ## %bb.0:
+; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0xc1]
+; AVX512-NEXT: ## xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%tmp2 = extractelement <4 x float> %t2, i32 0
%tmp1 = insertelement <4 x float> %t1, float %tmp2, i32 0
ret <4 x float> %tmp1
@@ -1217,8 +1229,8 @@ define <4 x i32> @i32_shuf_X00A(<4 x i32> %x, <4 x i32> %a) {
;
; AVX512-LABEL: i32_shuf_X00A:
; AVX512: ## %bb.0:
-; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe8,0x57,0xd2]
-; AVX512-NEXT: vmovss %xmm0, %xmm2, %xmm0 ## encoding: [0xc5,0xea,0x10,0xc0]
+; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x57,0xd2]
+; AVX512-NEXT: vmovss %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xea,0x10,0xc0]
; AVX512-NEXT: ## xmm0 = xmm0[0],xmm2[1,2,3]
; AVX512-NEXT: vbroadcastss %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc9]
; AVX512-NEXT: vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08]
diff --git a/llvm/test/CodeGen/X86/vec_ss_load_fold.ll b/llvm/test/CodeGen/X86/vec_ss_load_fold.ll
index e73d345d0fcd4..23cf271c2bb8f 100644
--- a/llvm/test/CodeGen/X86/vec_ss_load_fold.ll
+++ b/llvm/test/CodeGen/X86/vec_ss_load_fold.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -disable-peephole -mtriple=i686-apple-darwin9 -mattr=+sse,+sse2,+sse4.1 | FileCheck %s --check-prefixes=X86
; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin9 -mattr=+sse,+sse2,+sse4.1 | FileCheck %s --check-prefixes=X64
-; RUN: llc < %s -disable-peephole -mtriple=i686-apple-darwin9 -mattr=+avx | FileCheck %s --check-prefixes=X86_AVX,X86_AVX1
-; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin9 -mattr=+avx | FileCheck %s --check-prefixes=X64_AVX,X64_AVX1
-; RUN: llc < %s -disable-peephole -mtriple=i686-apple-darwin9 -mattr=+avx512f | FileCheck %s --check-prefixes=X86_AVX,X86_AVX512
-; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin9 -mattr=+avx512f | FileCheck %s --check-prefixes=X64_AVX,X64_AVX512
+; RUN: llc < %s -disable-peephole -mtriple=i686-apple-darwin9 -mattr=+avx | FileCheck %s --check-prefixes=X86_AVX
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin9 -mattr=+avx | FileCheck %s --check-prefixes=X64_AVX
+; RUN: llc < %s -disable-peephole -mtriple=i686-apple-darwin9 -mattr=+avx512f | FileCheck %s --check-prefixes=X86_AVX
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin9 -mattr=+avx512f | FileCheck %s --check-prefixes=X64_AVX
define i16 @test1(float %f) nounwind {
; X86-LABEL: test1:
@@ -32,57 +32,30 @@ define i16 @test1(float %f) nounwind {
; X64-NEXT: ## kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
;
-; X86_AVX1-LABEL: test1:
-; X86_AVX1: ## %bb.0:
-; X86_AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86_AVX1-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86_AVX1-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86_AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X86_AVX1-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; X86_AVX1-NEXT: vminss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86_AVX1-NEXT: vmaxss %xmm1, %xmm0, %xmm0
-; X86_AVX1-NEXT: vcvttss2si %xmm0, %eax
-; X86_AVX1-NEXT: ## kill: def $ax killed $ax killed $eax
-; X86_AVX1-NEXT: retl
-;
-; X64_AVX1-LABEL: test1:
-; X64_AVX1: ## %bb.0:
-; X64_AVX1-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64_AVX1-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64_AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64_AVX1-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; X64_AVX1-NEXT: vminss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64_AVX1-NEXT: vmaxss %xmm1, %xmm0, %xmm0
-; X64_AVX1-NEXT: vcvttss2si %xmm0, %eax
-; X64_AVX1-NEXT: ## kill: def $ax killed $ax killed $eax
-; X64_AVX1-NEXT: retq
-;
-; X86_AVX512-LABEL: test1:
-; X86_AVX512: ## %bb.0:
-; X86_AVX512-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86_AVX512-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86_AVX512-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86_AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X86_AVX512-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; X86_AVX512-NEXT: vminss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86_AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X86_AVX512-NEXT: vmaxss %xmm1, %xmm0, %xmm0
-; X86_AVX512-NEXT: vcvttss2si %xmm0, %eax
-; X86_AVX512-NEXT: ## kill: def $ax killed $ax killed $eax
-; X86_AVX512-NEXT: retl
+; X86_AVX-LABEL: test1:
+; X86_AVX: ## %bb.0:
+; X86_AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86_AVX-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86_AVX-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86_AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X86_AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; X86_AVX-NEXT: vminss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86_AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm0
+; X86_AVX-NEXT: vcvttss2si %xmm0, %eax
+; X86_AVX-NEXT: ## kill: def $ax killed $ax killed $eax
+; X86_AVX-NEXT: retl
;
-; X64_AVX512-LABEL: test1:
-; X64_AVX512: ## %bb.0:
-; X64_AVX512-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64_AVX512-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64_AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64_AVX512-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; X64_AVX512-NEXT: vminss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64_AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64_AVX512-NEXT: vmaxss %xmm1, %xmm0, %xmm0
-; X64_AVX512-NEXT: vcvttss2si %xmm0, %eax
-; X64_AVX512-NEXT: ## kill: def $ax killed $ax killed $eax
-; X64_AVX512-NEXT: retq
+; X64_AVX-LABEL: test1:
+; X64_AVX: ## %bb.0:
+; X64_AVX-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64_AVX-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64_AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X64_AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; X64_AVX-NEXT: vminss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64_AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm0
+; X64_AVX-NEXT: vcvttss2si %xmm0, %eax
+; X64_AVX-NEXT: ## kill: def $ax killed $ax killed $eax
+; X64_AVX-NEXT: retq
%tmp = insertelement <4 x float> undef, float %f, i32 0 ; <<4 x float>> [#uses=1]
%tmp10 = insertelement <4 x float> %tmp, float 0.000000e+00, i32 1 ; <<4 x float>> [#uses=1]
%tmp11 = insertelement <4 x float> %tmp10, float 0.000000e+00, i32 2 ; <<4 x float>> [#uses=1]
diff --git a/llvm/test/CodeGen/X86/vector-half-conversions.ll b/llvm/test/CodeGen/X86/vector-half-conversions.ll
index 01159d4135d8e..00b60893fc783 100644
--- a/llvm/test/CodeGen/X86/vector-half-conversions.ll
+++ b/llvm/test/CodeGen/X86/vector-half-conversions.ll
@@ -3272,7 +3272,7 @@ define <4 x i16> @cvt_4f64_to_4i16(<4 x double> %a0) nounwind {
; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: callq __truncdfhf2 at PLT
; AVX512-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX512-NEXT: # xmm0 = mem[0,1],xmm0[2,3]
; AVX512-NEXT: addq $72, %rsp
; AVX512-NEXT: retq
@@ -3404,7 +3404,7 @@ define <8 x i16> @cvt_4f64_to_8i16_undef(<4 x double> %a0) nounwind {
; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: callq __truncdfhf2 at PLT
; AVX512-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX512-NEXT: # xmm0 = mem[0,1],xmm0[2,3]
; AVX512-NEXT: addq $72, %rsp
; AVX512-NEXT: retq
@@ -4107,9 +4107,9 @@ define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, ptr %a1) nounwind {
; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: callq __truncdfhf2 at PLT
; AVX512-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX512-NEXT: # xmm0 = mem[0,1],xmm0[2,3]
-; AVX512-NEXT: vmovdqa %xmm0, (%rbx)
+; AVX512-NEXT: vmovaps %xmm0, (%rbx)
; AVX512-NEXT: addq $64, %rsp
; AVX512-NEXT: popq %rbx
; AVX512-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll b/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll
index 983ae594e3ab1..73537b4c0db76 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll
@@ -113,7 +113,6 @@ define i64 @test_v4i64_v4i16(<4 x i64> %a0) {
; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0
; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vmovq %xmm0, %rax
; AVX512BW-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-reduce-add-zext.ll b/llvm/test/CodeGen/X86/vector-reduce-add-zext.ll
index d99b200385585..890246467ef86 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-add-zext.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-add-zext.ll
@@ -240,7 +240,6 @@ define i32 @test_v4i32(<4 x i8> %a0) {
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-reduce-add.ll b/llvm/test/CodeGen/X86/vector-reduce-add.ll
index aed4e023e340c..437df521c3117 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-add.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-add.ll
@@ -1047,7 +1047,6 @@ define i8 @test_v4i8(<4 x i8> %a0) {
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: # kill: def $al killed $al killed $eax