[llvm] [WIP][RFC][X86] Remove MOVSS/D -> BLENDPS/D conversions from DAG/ISEL (PR #144338)

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Mon Jun 16 06:01:01 PDT 2025


https://github.com/RKSimon updated https://github.com/llvm/llvm-project/pull/144338

From 86e31070237e9cb219336795ebcb2026cd51736e Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Mon, 16 Jun 2025 12:39:01 +0100
Subject: [PATCH] [X86] Remove MOVSS/D -> BLENDPS/D conversions from DAG/ISEL

This patch removes as much of the MOVSS/D vs BLENDPS/D OptForSize/OptForSpeed instruction selection as possible, letting the later domain switching and X86FixupInstTuning passes handle it instead.

(V)MOVSS/D instructions are now created in all cases, which also avoids AVX512 getting stuck with VEX-only VBLENDPS/D instructions that restrict register usage to XMM0-15.

getExecutionDomainCustom can now convert MOVSS/D to PBLENDW/BLENDPS to support domain switches, and X86FixupInstTuning can convert VMOVSS/D back to VBLENDPS/D if the scheduler model prefers it (and we are not optimizing for size).
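
As a rough illustration (taken from the test_mm_move_ss coverage updated below), a scalar insertion shuffle such as the following now always selects (V)MOVSS at isel, with the later passes deciding whether a blend is preferable:

define <4 x float> @test_mm_move_ss(<4 x float> %a0, <4 x float> %a1) {
  %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
  ret <4 x float> %res
}
; AVX1:   vmovss %xmm1, %xmm0, %xmm0    # xmm0 = xmm1[0],xmm0[1,2,3]
; AVX512: vmovss %xmm1, %xmm0, %xmm0    # EVEX TO VEX Compression
; X86FixupInstTuning may later turn the VEX form back into vblendps if the
; scheduler model prefers it and we are not optimizing for size.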

Fixes #142972
---
 llvm/lib/Target/X86/X86FixupInstTuning.cpp    |  28 +++++
 llvm/lib/Target/X86/X86InstrAVX512.td         |  20 +--
 llvm/lib/Target/X86/X86InstrInfo.cpp          |  57 +++++++++
 llvm/lib/Target/X86/X86InstrSSE.td            |  76 +++---------
 .../CodeGen/X86/avx-intrinsics-x86-upgrade.ll |  16 ++-
 .../test/CodeGen/X86/avx512copy-intrinsics.ll |   2 +-
 llvm/test/CodeGen/X86/dpbusd.ll               |   1 -
 llvm/test/CodeGen/X86/dpbusd_const.ll         |   8 +-
 .../CodeGen/X86/sse-intrinsics-fast-isel.ll   | 115 ++++++++++++------
 .../CodeGen/X86/sse2-intrinsics-fast-isel.ll  |  92 +++++++++-----
 .../X86/sse2-intrinsics-x86-upgrade.ll        |   4 +-
 .../X86/sse41-intrinsics-x86-upgrade.ll       |  16 ++-
 llvm/test/CodeGen/X86/sse41.ll                |  42 ++++---
 llvm/test/CodeGen/X86/vec_ss_load_fold.ll     |  81 ++++--------
 .../CodeGen/X86/vector-half-conversions.ll    |   8 +-
 .../CodeGen/X86/vector-reduce-add-mask.ll     |   1 -
 .../CodeGen/X86/vector-reduce-add-zext.ll     |   1 -
 llvm/test/CodeGen/X86/vector-reduce-add.ll    |   1 -
 18 files changed, 331 insertions(+), 238 deletions(-)

diff --git a/llvm/lib/Target/X86/X86FixupInstTuning.cpp b/llvm/lib/Target/X86/X86FixupInstTuning.cpp
index 33dc0a232815c..099a7147d81a1 100644
--- a/llvm/lib/Target/X86/X86FixupInstTuning.cpp
+++ b/llvm/lib/Target/X86/X86FixupInstTuning.cpp
@@ -54,6 +54,7 @@ class X86FixupInstTuningPass : public MachineFunctionPass {
 
 private:
   const X86InstrInfo *TII = nullptr;
+  const X86RegisterInfo *TRI = nullptr;
   const X86Subtarget *ST = nullptr;
   const MCSchedModel *SM = nullptr;
 };
@@ -277,6 +278,18 @@ bool X86FixupInstTuningPass::processInstruction(
     return true;
   };
 
+  auto ProcessMOVToBLEND = [&](unsigned BlendOpc, unsigned BlendImm) -> bool {
+    if (OptSize || !NewOpcPreferable(BlendOpc, /*ReplaceInTie*/ false))
+      return false;
+    LLVM_DEBUG(dbgs() << "Replacing: " << MI);
+    {
+      MI.setDesc(TII->get(BlendOpc));
+      MI.addOperand(MachineOperand::CreateImm(BlendImm));
+    }
+    LLVM_DEBUG(dbgs() << "     With: " << MI);
+    return true;
+  };
+
   switch (Opc) {
   case X86::BLENDPDrri:
     return ProcessBLENDToMOV(X86::MOVSDrr, 0x3, 0x1);
@@ -296,6 +309,20 @@ bool X86FixupInstTuningPass::processInstruction(
     // TODO: Add X86::VPBLENDWYrmi handling
     return ProcessBLENDWToBLENDD(X86::VPBLENDDrri, 4);
 
+  case X86::VMOVSSZrr:
+    if (TRI->getEncodingValue(MI.getOperand(0).getReg()) >= 16 ||
+        TRI->getEncodingValue(MI.getOperand(1).getReg()) >= 16 ||
+        TRI->getEncodingValue(MI.getOperand(2).getReg()) >= 16)
+      return false;
+    return ProcessMOVToBLEND(X86::VBLENDPSrri, 0x01);
+
+  case X86::VMOVSDZrr:
+    if (TRI->getEncodingValue(MI.getOperand(0).getReg()) >= 16 ||
+        TRI->getEncodingValue(MI.getOperand(1).getReg()) >= 16 ||
+        TRI->getEncodingValue(MI.getOperand(2).getReg()) >= 16)
+      return false;
+    return ProcessMOVToBLEND(X86::VBLENDPDrri, 0x01);
+
   case X86::VPERMILPDri:
     return ProcessVPERMILPDri(X86::VSHUFPDrri);
   case X86::VPERMILPDYri:
@@ -573,6 +600,7 @@ bool X86FixupInstTuningPass::runOnMachineFunction(MachineFunction &MF) {
   bool Changed = false;
   ST = &MF.getSubtarget<X86Subtarget>();
   TII = ST->getInstrInfo();
+  TRI = ST->getRegisterInfo();
   SM = &ST->getSchedModel();
 
   for (MachineBasicBlock &MBB : MF) {
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index 0ab94cca41425..d369a2d8e9f68 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -3904,13 +3904,12 @@ def : Pat<(f64 (bitconvert VK64:$src)),
 
 multiclass avx512_move_scalar<string asm, SDNode OpNode, PatFrag vzload_frag,
                               X86VectorVTInfo _, Predicate prd = HasAVX512> {
-  let Predicates = !if (!eq (prd, HasFP16), [HasFP16], [prd, OptForSize]) in
+  let Predicates = [prd] in {
   def rr : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst),
              (ins _.RC:$src1, _.RC:$src2),
              !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
              [(set _.RC:$dst, (_.VT (OpNode _.RC:$src1, _.RC:$src2)))],
              _.ExeDomain>, EVEX, VVVV, Sched<[SchedWriteFShuffle.XMM]>;
-  let Predicates = [prd] in {
   def rrkz : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst),
               (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2),
               !strconcat(asm, "\t{$src2, $src1, $dst {${mask}} {z}|",
@@ -4394,7 +4393,7 @@ def : InstAlias<"vmovsd.s\t{$src2, $src1, $dst {${mask}} {z}|"#
                 (VMOVSDZrrkz_REV VR128X:$dst, VK1WM:$mask,
                                  VR128X:$src1, VR128X:$src2), 0>;
 
-let Predicates = [HasAVX512, OptForSize] in {
+let Predicates = [HasAVX512] in {
   def : Pat<(v4f32 (X86vzmovl (v4f32 VR128X:$src))),
             (VMOVSSZrr (v4f32 (AVX512_128_SET0)), VR128X:$src)>;
   def : Pat<(v4i32 (X86vzmovl (v4i32 VR128X:$src))),
@@ -4420,21 +4419,6 @@ let Predicates = [HasAVX512, OptForSize] in {
               (v4i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm)))), sub_xmm)>;
 }
 
-// Use 128-bit blends for OptForSpeed since BLENDs have better throughput than
-// VMOVSS/SD. Unfortunately, loses the ability to use XMM16-31.
-let Predicates = [HasAVX512, OptForSpeed] in {
-  def : Pat<(v16f32 (X86vzmovl (v16f32 VR512:$src))),
-            (SUBREG_TO_REG (i32 0),
-             (v4f32 (VBLENDPSrri (v4f32 (V_SET0)),
-                          (v4f32 (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm)),
-                          (i8 1))), sub_xmm)>;
-  def : Pat<(v16i32 (X86vzmovl (v16i32 VR512:$src))),
-            (SUBREG_TO_REG (i32 0),
-             (v4i32 (VPBLENDWrri (v4i32 (V_SET0)),
-                          (v4i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm)),
-                          (i8 3))), sub_xmm)>;
-}
-
 let Predicates = [HasAVX512] in {
   def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
             (VMOVSSZrm addr:$src)>;
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index abf365eedec39..d97f424f808e5 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -9073,6 +9073,30 @@ uint16_t X86InstrInfo::getExecutionDomainCustom(const MachineInstr &MI) const {
   case X86::VPBLENDWYrmi:
   case X86::VPBLENDWYrri:
     return GetBlendDomains(8, false);
+  case X86::VMOVSSZrr:
+    // Only convert to BLEND if we are VEX compatible.
+    if (RI.getEncodingValue(MI.getOperand(0).getReg()) >= 16 ||
+        RI.getEncodingValue(MI.getOperand(1).getReg()) >= 16 ||
+        RI.getEncodingValue(MI.getOperand(2).getReg()) >= 16)
+      return 0;
+    [[fallthrough]];
+  case X86::MOVSSrr:
+  case X86::VMOVSSrr:
+    if (Subtarget.hasSSE41())
+      return 0x2 | 0x8; // PackedSingle | PackedInt
+    return 0x2;         // PackedSingle
+  case X86::VMOVSDZrr:
+    // Only convert to BLEND if we are VEX compatible.
+    if (RI.getEncodingValue(MI.getOperand(0).getReg()) >= 16 ||
+        RI.getEncodingValue(MI.getOperand(1).getReg()) >= 16 ||
+        RI.getEncodingValue(MI.getOperand(2).getReg()) >= 16)
+      return 0;
+    [[fallthrough]];
+  case X86::MOVSDrr:
+  case X86::VMOVSDrr:
+    if (Subtarget.hasSSE41())
+      return 0x2 | 0x4 | 0x8; // PackedSingle | PackedDouble | PackedInt
+    return 0x4;               // PackedDouble
   case X86::VPANDDZ128rr:
   case X86::VPANDDZ128rm:
   case X86::VPANDDZ256rr:
@@ -9213,6 +9237,39 @@ bool X86InstrInfo::setExecutionDomainCustom(MachineInstr &MI,
   case X86::VPBLENDWYrmi:
   case X86::VPBLENDWYrri:
     return SetBlendDomain(16, true);
+  case X86::MOVSSrr:
+  case X86::VMOVSSrr:
+  case X86::VMOVSSZrr:
+    if (Domain == 3) { // PackedInt
+      MI.setDesc(
+          get(Opcode == X86::MOVSSrr ? X86::PBLENDWrri : X86::VPBLENDWrri));
+      MI.addOperand(MachineOperand::CreateImm(0x03));
+      if (Opcode == X86::VMOVSSZrr)
+        MI.setAsmPrinterFlag(X86::AC_EVEX_2_VEX);
+      return true;
+    }
+    return Domain == 1; // PackedSingle
+  case X86::MOVSDrr:
+  case X86::VMOVSDrr:
+  case X86::VMOVSDZrr:
+    if (Domain == 1) { // PackedSingle
+      MI.setDesc(
+          get(Opcode == X86::MOVSDrr ? X86::BLENDPSrri : X86::VBLENDPSrri));
+      MI.addOperand(MachineOperand::CreateImm(0x03));
+      if (Opcode == X86::VMOVSDZrr)
+        MI.setAsmPrinterFlag(X86::AC_EVEX_2_VEX);
+      return true;
+    } else if (Domain == 2) { // PackedDouble
+      return true;
+    } else if (Domain == 3) { // PackedInt
+      MI.setDesc(
+          get(Opcode == X86::MOVSDrr ? X86::PBLENDWrri : X86::VPBLENDWrri));
+      MI.addOperand(MachineOperand::CreateImm(0x0F));
+      if (Opcode == X86::VMOVSDZrr)
+        MI.setAsmPrinterFlag(X86::AC_EVEX_2_VEX);
+      return true;
+    }
+    return false;
   case X86::VPANDDZ128rr:
   case X86::VPANDDZ128rm:
   case X86::VPANDDZ256rr:
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 1acc0cd8da205..cbff8ffd4d761 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -209,10 +209,8 @@ multiclass sse12_move_rr<SDNode OpNode, ValueType vt, string base_opc,
 }
 
 multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
-                      X86MemOperand x86memop, string OpcodeStr,
-                      Domain d, Predicate pred> {
+                      X86MemOperand x86memop, string OpcodeStr, Domain d> {
   // AVX
-  let Predicates = [UseAVX, OptForSize] in
   defm V#NAME : sse12_move_rr<OpNode, vt, OpcodeStr,
                               "\t{$src2, $src1, $dst|$dst, $src1, $src2}", d>,
                               VEX, VVVV, VEX_LIG, WIG;
@@ -223,7 +221,6 @@ multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
                      VEX, VEX_LIG, Sched<[WriteFStore]>, WIG;
   // SSE1 & 2
   let Constraints = "$src1 = $dst" in {
-    let Predicates = [pred, NoSSE41_Or_OptForSize] in
     defm NAME : sse12_move_rr<OpNode, vt, OpcodeStr,
                               "\t{$src2, $dst|$dst, $src2}", d>;
   }
@@ -268,9 +265,9 @@ multiclass sse12_move_rm<RegisterClass RC, ValueType vt, X86MemOperand x86memop,
 }
 
 defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss",
-                        SSEPackedSingle, UseSSE1>, TB, XS;
+                        SSEPackedSingle>, TB, XS;
 defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd",
-                        SSEPackedDouble, UseSSE2>, TB, XD;
+                        SSEPackedDouble>, TB, XD;
 
 let canFoldAsLoad = 1, isReMaterializable = 1 in {
   defm MOVSS : sse12_move_rm<FR32, v4f32, f32mem, loadf32, X86vzload32, "movss",
@@ -292,9 +289,7 @@ let Predicates = [UseAVX] in {
             (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
   def : Pat<(v4f64 (X86vzload64 addr:$src)),
             (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
-}
 
-let Predicates = [UseAVX, OptForSize] in {
   // Move scalar to XMM zero-extended, zeroing a VR128 then do a
   // MOVSS to the lower bits.
   def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
@@ -313,22 +308,21 @@ let Predicates = [UseAVX, OptForSize] in {
               (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)))), sub_xmm)>;
 }
 
-let Predicates = [UseSSE1, NoSSE41_Or_OptForSize] in {
-// Move scalar to XMM zero-extended, zeroing a VR128 then do a
-// MOVSS to the lower bits.
-def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
-          (MOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
-def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
-          (MOVSSrr (v4i32 (V_SET0)), VR128:$src)>;
-}
-
 let Predicates = [UseSSE2] in
 def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
           (MOVSDrm addr:$src)>;
 
-let Predicates = [UseSSE1] in
-def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
-          (MOVSSrm addr:$src)>;
+let Predicates = [UseSSE1] in {
+  def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
+            (MOVSSrm addr:$src)>;
+
+  // Move scalar to XMM zero-extended, zeroing a VR128 then do a
+  // MOVSS to the lower bits.
+  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
+            (MOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
+  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
+            (MOVSSrr (v4i32 (V_SET0)), VR128:$src)>;
+}
 
 //===----------------------------------------------------------------------===//
 // SSE 1 & 2 - Move Aligned/Unaligned FP Instructions
@@ -6382,61 +6376,25 @@ let Predicates = [HasAVX] in {
             (VBLENDVPDYrrr VR256:$src2, VR256:$src1, VR256:$mask)>;
 }
 
-// Prefer a movss or movsd over a blendps when optimizing for size. these were
-// changed to use blends because blends have better throughput on sandybridge
-// and haswell, but movs[s/d] are 1-2 byte shorter instructions.
-let Predicates = [HasAVX, OptForSpeed] in {
-  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
-            (VBLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
-  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
-            (VPBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;
-
-  def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
-            (VBLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>;
+// TODO: Remove these and let foldMemoryOperandCustom handle it?
+let Predicates = [HasAVX] in {
   def : Pat<(v4f32 (X86Movss VR128:$src1, (loadv4f32 addr:$src2))),
             (VBLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>;
   def : Pat<(v4f32 (X86Movss (loadv4f32 addr:$src2), VR128:$src1)),
             (VBLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>;
 
-  def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
-            (VBLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
   def : Pat<(v2f64 (X86Movsd VR128:$src1, (loadv2f64 addr:$src2))),
             (VBLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>;
   def : Pat<(v2f64 (X86Movsd (loadv2f64 addr:$src2), VR128:$src1)),
             (VBLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>;
-
-  // Move low f32 and clear high bits.
-  def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
-            (SUBREG_TO_REG (i32 0),
-             (v4f32 (VBLENDPSrri (v4f32 (V_SET0)),
-                          (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)),
-                          (i8 1))), sub_xmm)>;
-  def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
-            (SUBREG_TO_REG (i32 0),
-             (v4i32 (VPBLENDWrri (v4i32 (V_SET0)),
-                          (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)),
-                          (i8 3))), sub_xmm)>;
 }
 
-// Prefer a movss or movsd over a blendps when optimizing for size. these were
-// changed to use blends because blends have better throughput on sandybridge
-// and haswell, but movs[s/d] are 1-2 byte shorter instructions.
-let Predicates = [UseSSE41, OptForSpeed] in {
-  // With SSE41 we can use blends for these patterns.
-  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
-            (BLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
-  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
-            (PBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;
-
-  def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
-            (BLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>;
+let Predicates = [UseSSE41] in {
   def : Pat<(v4f32 (X86Movss VR128:$src1, (memopv4f32 addr:$src2))),
             (BLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>;
   def : Pat<(v4f32 (X86Movss (memopv4f32 addr:$src2), VR128:$src1)),
             (BLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>;
 
-  def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
-            (BLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
   def : Pat<(v2f64 (X86Movsd VR128:$src1, (memopv2f64 addr:$src2))),
             (BLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>;
   def : Pat<(v2f64 (X86Movsd (memopv2f64 addr:$src2), VR128:$src1)),
diff --git a/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll b/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
index f0203b3b889e4..87ea43f87b2f2 100644
--- a/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
@@ -298,11 +298,17 @@ declare <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64>, i32) nounwind readnone
 
 
 define <2 x double> @test_x86_sse41_blendpd(<2 x double> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test_x86_sse41_blendpd:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovsd %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf3,0x10,0xc0]
-; CHECK-NEXT:    # xmm0 = xmm0[0],xmm1[1]
-; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+; AVX-LABEL: test_x86_sse41_blendpd:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovsd %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf3,0x10,0xc0]
+; AVX-NEXT:    # xmm0 = xmm0[0],xmm1[1]
+; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_sse41_blendpd:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovsd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf3,0x10,0xc0]
+; AVX512VL-NEXT:    # xmm0 = xmm0[0],xmm1[1]
+; AVX512VL-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %res = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i8 2) ; <<2 x double>> [#uses=1]
   ret <2 x double> %res
 }
diff --git a/llvm/test/CodeGen/X86/avx512copy-intrinsics.ll b/llvm/test/CodeGen/X86/avx512copy-intrinsics.ll
index a2af7df44010e..361dccf741aee 100644
--- a/llvm/test/CodeGen/X86/avx512copy-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512copy-intrinsics.ll
@@ -11,7 +11,7 @@ define <4 x i32> @test_mm_move_epi32(<4 x i32> %a0) nounwind {
 ; NOAVX512MOVZXC-LABEL: test_mm_move_epi32:
 ; NOAVX512MOVZXC:       # %bb.0:
 ; NOAVX512MOVZXC-NEXT:    vxorps %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf0,0x57,0xc9]
-; NOAVX512MOVZXC-NEXT:    vmovss %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf2,0x10,0xc0]
+; NOAVX512MOVZXC-NEXT:    vmovss %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf2,0x10,0xc0]
 ; NOAVX512MOVZXC-NEXT:    # xmm0 = xmm0[0],xmm1[1,2,3]
 ; NOAVX512MOVZXC-NEXT:    retq # encoding: [0xc3]
   %res = shufflevector <4 x i32> %a0, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 4, i32 4>
diff --git a/llvm/test/CodeGen/X86/dpbusd.ll b/llvm/test/CodeGen/X86/dpbusd.ll
index 3aa77c3955c63..1608c421ed548 100644
--- a/llvm/test/CodeGen/X86/dpbusd.ll
+++ b/llvm/test/CodeGen/X86/dpbusd.ll
@@ -345,7 +345,6 @@ define i32 @vpdpbusd_128(ptr%a, ptr%b, i32 %c, i32 %n) {
 ; AVX512VLVNNI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX512VLVNNI-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
 ; AVX512VLVNNI-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
-; AVX512VLVNNI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX512VLVNNI-NEXT:    vpdpbusd %xmm1, %xmm0, %xmm2
 ; AVX512VLVNNI-NEXT:    vmovd %xmm2, %eax
 ; AVX512VLVNNI-NEXT:    addl %edx, %eax
diff --git a/llvm/test/CodeGen/X86/dpbusd_const.ll b/llvm/test/CodeGen/X86/dpbusd_const.ll
index 456e6e8f263aa..c32d674a84435 100644
--- a/llvm/test/CodeGen/X86/dpbusd_const.ll
+++ b/llvm/test/CodeGen/X86/dpbusd_const.ll
@@ -48,7 +48,6 @@ define i32 @mul_4xi8_zc(<4 x i8> %a, i32 %c) {
 ; AVX512VLVNNI:       # %bb.0: # %entry
 ; AVX512VLVNNI-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX512VLVNNI-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; AVX512VLVNNI-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX512VLVNNI-NEXT:    vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
 ; AVX512VLVNNI-NEXT:    vmovd %xmm1, %eax
 ; AVX512VLVNNI-NEXT:    addl %edi, %eax
@@ -130,10 +129,9 @@ define i32 @mul_4xi8_cs(<4 x i8> %a, i32 %c) {
 ; AVX512VLVNNI:       # %bb.0: # %entry
 ; AVX512VLVNNI-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX512VLVNNI-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; AVX512VLVNNI-NEXT:    vmovd {{.*#+}} xmm1 = [16,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512VLVNNI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512VLVNNI-NEXT:    vpdpbusd %xmm0, %xmm1, %xmm2
-; AVX512VLVNNI-NEXT:    vmovd %xmm2, %eax
+; AVX512VLVNNI-NEXT:    vmovd {{.*#+}} xmm2 = [16,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512VLVNNI-NEXT:    vpdpbusd %xmm0, %xmm2, %xmm1
+; AVX512VLVNNI-NEXT:    vmovd %xmm1, %eax
 ; AVX512VLVNNI-NEXT:    addl %edi, %eax
 ; AVX512VLVNNI-NEXT:    retq
 entry:
diff --git a/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
index 2e2e78a6da51e..1fca9b78352ec 100644
--- a/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
@@ -187,12 +187,19 @@ define <4 x float> @test_mm_cmpge_ss(<4 x float> %a0, <4 x float> %a1) nounwind
 ; SSE-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
 ;
-; AVX-LABEL: test_mm_cmpge_ss:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vcmpless %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf2,0xc2,0xc8,0x02]
-; AVX-NEXT:    vmovss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x10,0xc1]
-; AVX-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
-; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+; AVX1-LABEL: test_mm_cmpge_ss:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vcmpless %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf2,0xc2,0xc8,0x02]
+; AVX1-NEXT:    vmovss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x10,0xc1]
+; AVX1-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX512-LABEL: test_mm_cmpge_ss:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vcmpless %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf2,0xc2,0xc8,0x02]
+; AVX512-NEXT:    vmovss %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0xc1]
+; AVX512-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 2)
   %res = shufflevector <4 x float> %a0, <4 x float> %cmp, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
   ret <4 x float> %res
@@ -229,12 +236,19 @@ define <4 x float> @test_mm_cmpgt_ss(<4 x float> %a0, <4 x float> %a1) nounwind
 ; SSE-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
 ;
-; AVX-LABEL: test_mm_cmpgt_ss:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vcmpltss %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf2,0xc2,0xc8,0x01]
-; AVX-NEXT:    vmovss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x10,0xc1]
-; AVX-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
-; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+; AVX1-LABEL: test_mm_cmpgt_ss:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vcmpltss %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf2,0xc2,0xc8,0x01]
+; AVX1-NEXT:    vmovss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x10,0xc1]
+; AVX1-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX512-LABEL: test_mm_cmpgt_ss:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vcmpltss %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf2,0xc2,0xc8,0x01]
+; AVX512-NEXT:    vmovss %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0xc1]
+; AVX512-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 1)
   %res = shufflevector <4 x float> %a0, <4 x float> %cmp, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
   ret <4 x float> %res
@@ -379,12 +393,19 @@ define <4 x float> @test_mm_cmpnge_ss(<4 x float> %a0, <4 x float> %a1) nounwind
 ; SSE-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
 ;
-; AVX-LABEL: test_mm_cmpnge_ss:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vcmpnless %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf2,0xc2,0xc8,0x06]
-; AVX-NEXT:    vmovss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x10,0xc1]
-; AVX-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
-; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+; AVX1-LABEL: test_mm_cmpnge_ss:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vcmpnless %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf2,0xc2,0xc8,0x06]
+; AVX1-NEXT:    vmovss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x10,0xc1]
+; AVX1-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX512-LABEL: test_mm_cmpnge_ss:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vcmpnless %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf2,0xc2,0xc8,0x06]
+; AVX512-NEXT:    vmovss %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0xc1]
+; AVX512-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 6)
   %res = shufflevector <4 x float> %a0, <4 x float> %cmp, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
   ret <4 x float> %res
@@ -421,12 +442,19 @@ define <4 x float> @test_mm_cmpngt_ss(<4 x float> %a0, <4 x float> %a1) nounwind
 ; SSE-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
 ;
-; AVX-LABEL: test_mm_cmpngt_ss:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vcmpnltss %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf2,0xc2,0xc8,0x05]
-; AVX-NEXT:    vmovss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x10,0xc1]
-; AVX-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
-; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+; AVX1-LABEL: test_mm_cmpngt_ss:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vcmpnltss %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf2,0xc2,0xc8,0x05]
+; AVX1-NEXT:    vmovss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x10,0xc1]
+; AVX1-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX512-LABEL: test_mm_cmpngt_ss:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vcmpnltss %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf2,0xc2,0xc8,0x05]
+; AVX512-NEXT:    vmovss %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0xc1]
+; AVX512-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 5)
   %res = shufflevector <4 x float> %a0, <4 x float> %cmp, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
   ret <4 x float> %res
@@ -1601,11 +1629,17 @@ define <4 x float> @test_mm_move_ss(<4 x float> %a0, <4 x float> %a1) {
 ; SSE-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
 ;
-; AVX-LABEL: test_mm_move_ss:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x10,0xc1]
-; AVX-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
-; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+; AVX1-LABEL: test_mm_move_ss:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x10,0xc1]
+; AVX1-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX512-LABEL: test_mm_move_ss:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovss %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0xc1]
+; AVX512-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
   ret <4 x float> %res
 }
@@ -2227,8 +2261,8 @@ define <4 x float> @test_mm_set_ss(float %a0) nounwind {
 ; X86-AVX512:       # %bb.0:
 ; X86-AVX512-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-AVX512-NEXT:    # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x44,0x24,0x04]
-; X86-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf0,0x57,0xc9]
-; X86-AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf2,0x10,0xc0]
+; X86-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9]
+; X86-AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf2,0x10,0xc0]
 ; X86-AVX512-NEXT:    # xmm0 = xmm0[0],xmm1[1,2,3]
 ; X86-AVX512-NEXT:    retl # encoding: [0xc3]
 ;
@@ -2240,12 +2274,19 @@ define <4 x float> @test_mm_set_ss(float %a0) nounwind {
 ; X64-SSE-NEXT:    movaps %xmm1, %xmm0 # encoding: [0x0f,0x28,0xc1]
 ; X64-SSE-NEXT:    retq # encoding: [0xc3]
 ;
-; X64-AVX-LABEL: test_mm_set_ss:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf0,0x57,0xc9]
-; X64-AVX-NEXT:    vmovss %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf2,0x10,0xc0]
-; X64-AVX-NEXT:    # xmm0 = xmm0[0],xmm1[1,2,3]
-; X64-AVX-NEXT:    retq # encoding: [0xc3]
+; X64-AVX1-LABEL: test_mm_set_ss:
+; X64-AVX1:       # %bb.0:
+; X64-AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf0,0x57,0xc9]
+; X64-AVX1-NEXT:    vmovss %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf2,0x10,0xc0]
+; X64-AVX1-NEXT:    # xmm0 = xmm0[0],xmm1[1,2,3]
+; X64-AVX1-NEXT:    retq # encoding: [0xc3]
+;
+; X64-AVX512-LABEL: test_mm_set_ss:
+; X64-AVX512:       # %bb.0:
+; X64-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9]
+; X64-AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf2,0x10,0xc0]
+; X64-AVX512-NEXT:    # xmm0 = xmm0[0],xmm1[1,2,3]
+; X64-AVX512-NEXT:    retq # encoding: [0xc3]
   %res0  = insertelement <4 x float> undef, float %a0, i32 0
   %res1  = insertelement <4 x float> %res0, float 0.0, i32 1
   %res2  = insertelement <4 x float> %res1, float 0.0, i32 2
diff --git a/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
index 3f48b22e2b9ff..79adbb5a54248 100644
--- a/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
@@ -628,12 +628,19 @@ define <2 x double> @test_mm_cmpge_sd(<2 x double> %a0, <2 x double> %a1) nounwi
 ; SSE-NEXT:    # xmm0 = xmm1[0],xmm0[1]
 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
 ;
-; AVX-LABEL: test_mm_cmpge_sd:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vcmplesd %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf3,0xc2,0xc8,0x02]
-; AVX-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfb,0x10,0xc1]
-; AVX-NEXT:    # xmm0 = xmm1[0],xmm0[1]
-; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+; AVX1-LABEL: test_mm_cmpge_sd:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vcmplesd %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf3,0xc2,0xc8,0x02]
+; AVX1-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfb,0x10,0xc1]
+; AVX1-NEXT:    # xmm0 = xmm1[0],xmm0[1]
+; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX512-LABEL: test_mm_cmpge_sd:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vcmplesd %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf3,0xc2,0xc8,0x02]
+; AVX512-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0xc1]
+; AVX512-NEXT:    # xmm0 = xmm1[0],xmm0[1]
+; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %cmp = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a1, <2 x double> %a0, i8 2)
   %ext0 = extractelement <2 x double> %cmp, i32 0
   %ins0 = insertelement <2 x double> undef, double %ext0, i32 0
@@ -745,12 +752,19 @@ define <2 x double> @test_mm_cmpgt_sd(<2 x double> %a0, <2 x double> %a1) nounwi
 ; SSE-NEXT:    # xmm0 = xmm1[0],xmm0[1]
 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
 ;
-; AVX-LABEL: test_mm_cmpgt_sd:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vcmpltsd %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf3,0xc2,0xc8,0x01]
-; AVX-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfb,0x10,0xc1]
-; AVX-NEXT:    # xmm0 = xmm1[0],xmm0[1]
-; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+; AVX1-LABEL: test_mm_cmpgt_sd:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vcmpltsd %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf3,0xc2,0xc8,0x01]
+; AVX1-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfb,0x10,0xc1]
+; AVX1-NEXT:    # xmm0 = xmm1[0],xmm0[1]
+; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX512-LABEL: test_mm_cmpgt_sd:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vcmpltsd %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf3,0xc2,0xc8,0x01]
+; AVX512-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0xc1]
+; AVX512-NEXT:    # xmm0 = xmm1[0],xmm0[1]
+; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %cmp = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a1, <2 x double> %a0, i8 1)
   %ext0 = extractelement <2 x double> %cmp, i32 0
   %ins0 = insertelement <2 x double> undef, double %ext0, i32 0
@@ -973,12 +987,19 @@ define <2 x double> @test_mm_cmpnge_sd(<2 x double> %a0, <2 x double> %a1) nounw
 ; SSE-NEXT:    # xmm0 = xmm1[0],xmm0[1]
 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
 ;
-; AVX-LABEL: test_mm_cmpnge_sd:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vcmpnlesd %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf3,0xc2,0xc8,0x06]
-; AVX-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfb,0x10,0xc1]
-; AVX-NEXT:    # xmm0 = xmm1[0],xmm0[1]
-; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+; AVX1-LABEL: test_mm_cmpnge_sd:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vcmpnlesd %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf3,0xc2,0xc8,0x06]
+; AVX1-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfb,0x10,0xc1]
+; AVX1-NEXT:    # xmm0 = xmm1[0],xmm0[1]
+; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX512-LABEL: test_mm_cmpnge_sd:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vcmpnlesd %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf3,0xc2,0xc8,0x06]
+; AVX512-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0xc1]
+; AVX512-NEXT:    # xmm0 = xmm1[0],xmm0[1]
+; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %cmp = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a1, <2 x double> %a0, i8 6)
   %ext0 = extractelement <2 x double> %cmp, i32 0
   %ins0 = insertelement <2 x double> undef, double %ext0, i32 0
@@ -1018,12 +1039,19 @@ define <2 x double> @test_mm_cmpngt_sd(<2 x double> %a0, <2 x double> %a1) nounw
 ; SSE-NEXT:    # xmm0 = xmm1[0],xmm0[1]
 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
 ;
-; AVX-LABEL: test_mm_cmpngt_sd:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vcmpnltsd %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf3,0xc2,0xc8,0x05]
-; AVX-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfb,0x10,0xc1]
-; AVX-NEXT:    # xmm0 = xmm1[0],xmm0[1]
-; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+; AVX1-LABEL: test_mm_cmpngt_sd:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vcmpnltsd %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf3,0xc2,0xc8,0x05]
+; AVX1-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfb,0x10,0xc1]
+; AVX1-NEXT:    # xmm0 = xmm1[0],xmm0[1]
+; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX512-LABEL: test_mm_cmpngt_sd:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vcmpnltsd %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf3,0xc2,0xc8,0x05]
+; AVX512-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0xc1]
+; AVX512-NEXT:    # xmm0 = xmm1[0],xmm0[1]
+; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %cmp = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a1, <2 x double> %a0, i8 5)
   %ext0 = extractelement <2 x double> %cmp, i32 0
   %ins0 = insertelement <2 x double> undef, double %ext0, i32 0
@@ -3008,11 +3036,17 @@ define <2 x double> @test_mm_move_sd(<2 x double> %a0, <2 x double> %a1) nounwin
 ; SSE-NEXT:    # xmm0 = xmm1[0],xmm0[1]
 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
 ;
-; AVX-LABEL: test_mm_move_sd:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfb,0x10,0xc1]
-; AVX-NEXT:    # xmm0 = xmm1[0],xmm0[1]
-; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+; AVX1-LABEL: test_mm_move_sd:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfb,0x10,0xc1]
+; AVX1-NEXT:    # xmm0 = xmm1[0],xmm0[1]
+; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX512-LABEL: test_mm_move_sd:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0xc1]
+; AVX512-NEXT:    # xmm0 = xmm1[0],xmm0[1]
+; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %ext0 = extractelement <2 x double> %a1, i32 0
   %res0 = insertelement <2 x double> undef, double %ext0, i32 0
   %ext1 = extractelement <2 x double> %a0, i32 1
diff --git a/llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll b/llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
index 413b4e79257a0..423e298b11faa 100644
--- a/llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
+++ b/llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
@@ -734,7 +734,7 @@ define <2 x double> @test_x86_sse2_cvtss2sd_load(<2 x double> %a0, ptr %p1) {
 ; X86-AVX512-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; X86-AVX512-NEXT:    ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x08]
 ; X86-AVX512-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf2,0x5a,0xc9]
-; X86-AVX512-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x10,0xc1]
+; X86-AVX512-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0xc1]
 ; X86-AVX512-NEXT:    ## xmm0 = xmm1[0],xmm0[1]
 ; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
 ;
@@ -761,7 +761,7 @@ define <2 x double> @test_x86_sse2_cvtss2sd_load(<2 x double> %a0, ptr %p1) {
 ; X64-AVX512-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; X64-AVX512-NEXT:    ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x0f]
 ; X64-AVX512-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf2,0x5a,0xc9]
-; X64-AVX512-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x10,0xc1]
+; X64-AVX512-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0xc1]
 ; X64-AVX512-NEXT:    ## xmm0 = xmm1[0],xmm0[1]
 ; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
   %a1 = load <4 x float>, ptr %p1
diff --git a/llvm/test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll b/llvm/test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll
index 137606b7cfeed..0d360ba7c005e 100644
--- a/llvm/test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll
+++ b/llvm/test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll
@@ -16,11 +16,17 @@ define <2 x double> @test_x86_sse41_blendpd(<2 x double> %a0, <2 x double> %a1)
 ; SSE-NEXT:    ## xmm0 = xmm0[0,1],xmm1[2,3]
 ; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
 ;
-; AVX-LABEL: test_x86_sse41_blendpd:
-; AVX:       ## %bb.0:
-; AVX-NEXT:    vmovsd %xmm0, %xmm1, %xmm0 ## encoding: [0xc5,0xf3,0x10,0xc0]
-; AVX-NEXT:    ## xmm0 = xmm0[0],xmm1[1]
-; AVX-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+; AVX1-LABEL: test_x86_sse41_blendpd:
+; AVX1:       ## %bb.0:
+; AVX1-NEXT:    vmovsd %xmm0, %xmm1, %xmm0 ## encoding: [0xc5,0xf3,0x10,0xc0]
+; AVX1-NEXT:    ## xmm0 = xmm0[0],xmm1[1]
+; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512-LABEL: test_x86_sse41_blendpd:
+; AVX512:       ## %bb.0:
+; AVX512-NEXT:    vmovsd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf3,0x10,0xc0]
+; AVX512-NEXT:    ## xmm0 = xmm0[0],xmm1[1]
+; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i32 6) ; <<2 x double>> [#uses=1]
   ret <2 x double> %res
 }
diff --git a/llvm/test/CodeGen/X86/sse41.ll b/llvm/test/CodeGen/X86/sse41.ll
index 4f5b7ee0eaea0..3c6d220bc0ffa 100644
--- a/llvm/test/CodeGen/X86/sse41.ll
+++ b/llvm/test/CodeGen/X86/sse41.ll
@@ -3,8 +3,8 @@
 ; RUN: llc < %s -disable-peephole -mtriple=i386-apple-darwin -mattr=+avx -show-mc-encoding | FileCheck %s --check-prefixes=AVX,AVX1,X86-AVX1
 ; RUN: llc < %s -disable-peephole -mtriple=i386-apple-darwin -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl -show-mc-encoding | FileCheck %s --check-prefixes=AVX,AVX512,X86-AVX512
 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=+sse4.1 -show-mc-encoding | FileCheck %s --check-prefixes=SSE,X64-SSE
-; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=+avx -show-mc-encoding | FileCheck %s --check-prefixes=AVX,X64-AVX,AVX1,X64-AVX1
-; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl -show-mc-encoding | FileCheck %s --check-prefixes=AVX,X64-AVX,AVX512,X64-AVX512
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=+avx -show-mc-encoding | FileCheck %s --check-prefixes=AVX,AVX1,X64-AVX1
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl -show-mc-encoding | FileCheck %s --check-prefixes=AVX,AVX512,X64-AVX512
 
 @g16 = external global i16
 
@@ -361,7 +361,7 @@ define <4 x float> @blendps_not_insertps_1(<4 x float> %t1, float %t2) nounwind
 ; X86-AVX512:       ## %bb.0:
 ; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 ## xmm1 = mem[0],zero,zero,zero
 ; X86-AVX512-NEXT:    ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x04]
-; X86-AVX512-NEXT:    vmovss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x10,0xc1]
+; X86-AVX512-NEXT:    vmovss %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0xc1]
 ; X86-AVX512-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
 ; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
 ;
@@ -371,11 +371,17 @@ define <4 x float> @blendps_not_insertps_1(<4 x float> %t1, float %t2) nounwind
 ; X64-SSE-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
 ; X64-SSE-NEXT:    retq ## encoding: [0xc3]
 ;
-; X64-AVX-LABEL: blendps_not_insertps_1:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vmovss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x10,0xc1]
-; X64-AVX-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
+; X64-AVX1-LABEL: blendps_not_insertps_1:
+; X64-AVX1:       ## %bb.0:
+; X64-AVX1-NEXT:    vmovss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x10,0xc1]
+; X64-AVX1-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
+; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
+;
+; X64-AVX512-LABEL: blendps_not_insertps_1:
+; X64-AVX512:       ## %bb.0:
+; X64-AVX512-NEXT:    vmovss %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0xc1]
+; X64-AVX512-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
+; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
   %tmp1 = insertelement <4 x float> %t1, float %t2, i32 0
   ret <4 x float> %tmp1
 }
@@ -438,11 +444,17 @@ define <4 x float> @blendps_not_insertps_2(<4 x float> %t1, <4 x float> %t2) nou
 ; SSE-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
 ; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
 ;
-; AVX-LABEL: blendps_not_insertps_2:
-; AVX:       ## %bb.0:
-; AVX-NEXT:    vmovss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x10,0xc1]
-; AVX-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
-; AVX-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+; AVX1-LABEL: blendps_not_insertps_2:
+; AVX1:       ## %bb.0:
+; AVX1-NEXT:    vmovss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x10,0xc1]
+; AVX1-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512-LABEL: blendps_not_insertps_2:
+; AVX512:       ## %bb.0:
+; AVX512-NEXT:    vmovss %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0xc1]
+; AVX512-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %tmp2 = extractelement <4 x float> %t2, i32 0
   %tmp1 = insertelement <4 x float> %t1, float %tmp2, i32 0
   ret <4 x float> %tmp1
@@ -1217,8 +1229,8 @@ define <4 x i32> @i32_shuf_X00A(<4 x i32> %x, <4 x i32> %a) {
 ;
 ; AVX512-LABEL: i32_shuf_X00A:
 ; AVX512:       ## %bb.0:
-; AVX512-NEXT:    vxorps %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe8,0x57,0xd2]
-; AVX512-NEXT:    vmovss %xmm0, %xmm2, %xmm0 ## encoding: [0xc5,0xea,0x10,0xc0]
+; AVX512-NEXT:    vxorps %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x57,0xd2]
+; AVX512-NEXT:    vmovss %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xea,0x10,0xc0]
 ; AVX512-NEXT:    ## xmm0 = xmm0[0],xmm2[1,2,3]
 ; AVX512-NEXT:    vbroadcastss %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc9]
 ; AVX512-NEXT:    vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08]
diff --git a/llvm/test/CodeGen/X86/vec_ss_load_fold.ll b/llvm/test/CodeGen/X86/vec_ss_load_fold.ll
index e73d345d0fcd4..23cf271c2bb8f 100644
--- a/llvm/test/CodeGen/X86/vec_ss_load_fold.ll
+++ b/llvm/test/CodeGen/X86/vec_ss_load_fold.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -disable-peephole -mtriple=i686-apple-darwin9 -mattr=+sse,+sse2,+sse4.1 | FileCheck %s --check-prefixes=X86
 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin9 -mattr=+sse,+sse2,+sse4.1 | FileCheck %s --check-prefixes=X64
-; RUN: llc < %s -disable-peephole -mtriple=i686-apple-darwin9 -mattr=+avx | FileCheck %s --check-prefixes=X86_AVX,X86_AVX1
-; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin9 -mattr=+avx | FileCheck %s --check-prefixes=X64_AVX,X64_AVX1
-; RUN: llc < %s -disable-peephole -mtriple=i686-apple-darwin9 -mattr=+avx512f | FileCheck %s --check-prefixes=X86_AVX,X86_AVX512
-; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin9 -mattr=+avx512f | FileCheck %s --check-prefixes=X64_AVX,X64_AVX512
+; RUN: llc < %s -disable-peephole -mtriple=i686-apple-darwin9 -mattr=+avx | FileCheck %s --check-prefixes=X86_AVX
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin9 -mattr=+avx | FileCheck %s --check-prefixes=X64_AVX
+; RUN: llc < %s -disable-peephole -mtriple=i686-apple-darwin9 -mattr=+avx512f | FileCheck %s --check-prefixes=X86_AVX
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin9 -mattr=+avx512f | FileCheck %s --check-prefixes=X64_AVX
 
 define i16 @test1(float %f) nounwind {
 ; X86-LABEL: test1:
@@ -32,57 +32,30 @@ define i16 @test1(float %f) nounwind {
 ; X64-NEXT:    ## kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    retq
 ;
-; X86_AVX1-LABEL: test1:
-; X86_AVX1:       ## %bb.0:
-; X86_AVX1-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86_AVX1-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86_AVX1-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86_AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X86_AVX1-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; X86_AVX1-NEXT:    vminss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86_AVX1-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
-; X86_AVX1-NEXT:    vcvttss2si %xmm0, %eax
-; X86_AVX1-NEXT:    ## kill: def $ax killed $ax killed $eax
-; X86_AVX1-NEXT:    retl
-;
-; X64_AVX1-LABEL: test1:
-; X64_AVX1:       ## %bb.0:
-; X64_AVX1-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64_AVX1-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64_AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64_AVX1-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; X64_AVX1-NEXT:    vminss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64_AVX1-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
-; X64_AVX1-NEXT:    vcvttss2si %xmm0, %eax
-; X64_AVX1-NEXT:    ## kill: def $ax killed $ax killed $eax
-; X64_AVX1-NEXT:    retq
-;
-; X86_AVX512-LABEL: test1:
-; X86_AVX512:       ## %bb.0:
-; X86_AVX512-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86_AVX512-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86_AVX512-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86_AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X86_AVX512-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; X86_AVX512-NEXT:    vminss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86_AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X86_AVX512-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
-; X86_AVX512-NEXT:    vcvttss2si %xmm0, %eax
-; X86_AVX512-NEXT:    ## kill: def $ax killed $ax killed $eax
-; X86_AVX512-NEXT:    retl
+; X86_AVX-LABEL: test1:
+; X86_AVX:       ## %bb.0:
+; X86_AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86_AVX-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86_AVX-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86_AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; X86_AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; X86_AVX-NEXT:    vminss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86_AVX-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
+; X86_AVX-NEXT:    vcvttss2si %xmm0, %eax
+; X86_AVX-NEXT:    ## kill: def $ax killed $ax killed $eax
+; X86_AVX-NEXT:    retl
 ;
-; X64_AVX512-LABEL: test1:
-; X64_AVX512:       ## %bb.0:
-; X64_AVX512-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64_AVX512-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64_AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64_AVX512-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; X64_AVX512-NEXT:    vminss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64_AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64_AVX512-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
-; X64_AVX512-NEXT:    vcvttss2si %xmm0, %eax
-; X64_AVX512-NEXT:    ## kill: def $ax killed $ax killed $eax
-; X64_AVX512-NEXT:    retq
+; X64_AVX-LABEL: test1:
+; X64_AVX:       ## %bb.0:
+; X64_AVX-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64_AVX-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64_AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; X64_AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; X64_AVX-NEXT:    vminss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64_AVX-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
+; X64_AVX-NEXT:    vcvttss2si %xmm0, %eax
+; X64_AVX-NEXT:    ## kill: def $ax killed $ax killed $eax
+; X64_AVX-NEXT:    retq
   %tmp = insertelement <4 x float> undef, float %f, i32 0		; <<4 x float>> [#uses=1]
   %tmp10 = insertelement <4 x float> %tmp, float 0.000000e+00, i32 1		; <<4 x float>> [#uses=1]
   %tmp11 = insertelement <4 x float> %tmp10, float 0.000000e+00, i32 2		; <<4 x float>> [#uses=1]
diff --git a/llvm/test/CodeGen/X86/vector-half-conversions.ll b/llvm/test/CodeGen/X86/vector-half-conversions.ll
index 01159d4135d8e..00b60893fc783 100644
--- a/llvm/test/CodeGen/X86/vector-half-conversions.ll
+++ b/llvm/test/CodeGen/X86/vector-half-conversions.ll
@@ -3272,7 +3272,7 @@ define <4 x i16> @cvt_4f64_to_4i16(<4 x double> %a0) nounwind {
 ; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512-NEXT:    callq __truncdfhf2 at PLT
 ; AVX512-NEXT:    vpbroadcastw %xmm0, %xmm0
-; AVX512-NEXT:    vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT:    vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX512-NEXT:    # xmm0 = mem[0,1],xmm0[2,3]
 ; AVX512-NEXT:    addq $72, %rsp
 ; AVX512-NEXT:    retq
@@ -3404,7 +3404,7 @@ define <8 x i16> @cvt_4f64_to_8i16_undef(<4 x double> %a0) nounwind {
 ; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512-NEXT:    callq __truncdfhf2 at PLT
 ; AVX512-NEXT:    vpbroadcastw %xmm0, %xmm0
-; AVX512-NEXT:    vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT:    vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX512-NEXT:    # xmm0 = mem[0,1],xmm0[2,3]
 ; AVX512-NEXT:    addq $72, %rsp
 ; AVX512-NEXT:    retq
@@ -4107,9 +4107,9 @@ define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, ptr %a1) nounwind {
 ; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512-NEXT:    callq __truncdfhf2 at PLT
 ; AVX512-NEXT:    vpbroadcastw %xmm0, %xmm0
-; AVX512-NEXT:    vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT:    vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX512-NEXT:    # xmm0 = mem[0,1],xmm0[2,3]
-; AVX512-NEXT:    vmovdqa %xmm0, (%rbx)
+; AVX512-NEXT:    vmovaps %xmm0, (%rbx)
 ; AVX512-NEXT:    addq $64, %rsp
 ; AVX512-NEXT:    popq %rbx
 ; AVX512-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll b/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll
index 983ae594e3ab1..73537b4c0db76 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll
@@ -113,7 +113,6 @@ define i64 @test_v4i64_v4i16(<4 x i64> %a0) {
 ; AVX512BW-NEXT:    vpmovqb %zmm0, %xmm0
 ; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vmovq %xmm0, %rax
 ; AVX512BW-NEXT:    vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-reduce-add-zext.ll b/llvm/test/CodeGen/X86/vector-reduce-add-zext.ll
index d99b200385585..890246467ef86 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-add-zext.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-add-zext.ll
@@ -240,7 +240,6 @@ define i32 @test_v4i32(<4 x i8> %a0) {
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX512-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/vector-reduce-add.ll b/llvm/test/CodeGen/X86/vector-reduce-add.ll
index aed4e023e340c..437df521c3117 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-add.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-add.ll
@@ -1047,7 +1047,6 @@ define i8 @test_v4i8(<4 x i8> %a0) {
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX512-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax


