[llvm] [CodeGen][Spill2Reg] Initial patch (PR #118832)

via llvm-commits llvm-commits at lists.llvm.org
Wed Dec 25 07:59:58 PST 2024


https://github.com/vporpo updated https://github.com/llvm/llvm-project/pull/118832

>From ede86a24008c06372dd5438c0f71e17396273cab Mon Sep 17 00:00:00 2001
From: Vasileios Porpodas <vporpodas at google.com>
Date: Thu, 5 Dec 2024 08:29:18 -0800
Subject: [PATCH 1/9] [CodeGen][Spill2Reg] Initial patch

This is the first commit for the Spill2Reg optimization pass.
The goal of this pass is to selectively replace spills to the stack with
spills to vector registers. This can help remove back-end stalls in x86.
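
For example (a rough sketch; the exact opcodes and register choices are
illustrative, borrowed from the MIR tests added later in this series),
a 32-bit spill/reload pair through the stack:

  MOV32mr %stack.0, 1, $noreg, 0, $noreg, killed renamable $eax
  $eax = MOV32rm %stack.0, 1, $noreg, 0, $noreg

would instead become a pair of GPR<->XMM copies that bypass the memory
units:

  $xmm0 = MOVDI2PDIrr $eax
  $eax = MOVPDI2DIrr $xmm0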

Old code review: https://reviews.llvm.org/D118298

RFC:
https://lists.llvm.org/pipermail/llvm-dev/2022-January/154782.html
https://discourse.llvm.org/t/rfc-spill2reg-selectively-replace-spills-to-stack-with-spills-to-vector-registers/59630
---
 llvm/include/llvm/CodeGen/Passes.h    |  3 ++
 llvm/include/llvm/InitializePasses.h  |  1 +
 llvm/lib/CodeGen/CMakeLists.txt       |  1 +
 llvm/lib/CodeGen/CodeGen.cpp          |  1 +
 llvm/lib/CodeGen/Spill2Reg.cpp        | 56 +++++++++++++++++++++++++++
 llvm/lib/CodeGen/TargetPassConfig.cpp |  9 +++++
 6 files changed, 71 insertions(+)
 create mode 100644 llvm/lib/CodeGen/Spill2Reg.cpp

diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h
index d1fac4a304cffe..77d305aa7d0a9c 100644
--- a/llvm/include/llvm/CodeGen/Passes.h
+++ b/llvm/include/llvm/CodeGen/Passes.h
@@ -608,6 +608,9 @@ namespace llvm {
 
   /// Lowers KCFI operand bundles for indirect calls.
   FunctionPass *createKCFIPass();
+
+  /// This pass replaces spills to stack with spills to registers.
+  extern char &Spill2RegID;
 } // End llvm namespace
 
 #endif
diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h
index 1cb9013bc48cc5..9d427fc695f86b 100644
--- a/llvm/include/llvm/InitializePasses.h
+++ b/llvm/include/llvm/InitializePasses.h
@@ -322,6 +322,7 @@ void initializeWasmEHPreparePass(PassRegistry &);
 void initializeWinEHPreparePass(PassRegistry &);
 void initializeWriteBitcodePassPass(PassRegistry &);
 void initializeXRayInstrumentationPass(PassRegistry &);
+void initializeSpill2RegPass(PassRegistry &);
 
 } // end namespace llvm
 
diff --git a/llvm/lib/CodeGen/CMakeLists.txt b/llvm/lib/CodeGen/CMakeLists.txt
index 145fd2fac8b564..65a60f56063543 100644
--- a/llvm/lib/CodeGen/CMakeLists.txt
+++ b/llvm/lib/CodeGen/CMakeLists.txt
@@ -219,6 +219,7 @@ add_llvm_component_library(LLVMCodeGen
   SjLjEHPrepare.cpp
   SlotIndexes.cpp
   SpillPlacement.cpp
+  Spill2Reg.cpp
   SplitKit.cpp
   StackColoring.cpp
   StackFrameLayoutAnalysisPass.cpp
diff --git a/llvm/lib/CodeGen/CodeGen.cpp b/llvm/lib/CodeGen/CodeGen.cpp
index 8efe540770913a..348694c42dbef9 100644
--- a/llvm/lib/CodeGen/CodeGen.cpp
+++ b/llvm/lib/CodeGen/CodeGen.cpp
@@ -143,4 +143,5 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
   initializeWasmEHPreparePass(Registry);
   initializeWinEHPreparePass(Registry);
   initializeXRayInstrumentationPass(Registry);
+  initializeSpill2RegPass(Registry);
 }
diff --git a/llvm/lib/CodeGen/Spill2Reg.cpp b/llvm/lib/CodeGen/Spill2Reg.cpp
new file mode 100644
index 00000000000000..09ffa71b891cb5
--- /dev/null
+++ b/llvm/lib/CodeGen/Spill2Reg.cpp
@@ -0,0 +1,56 @@
+//===- Spill2Reg.cpp - Spill To Register Optimization ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//
+/// \file This file implements Spill2Reg, an optimization which selectively
+/// replaces spills/reloads to/from the stack with register copies to/from the
+/// vector register file. This works even on targets where loads/stores have
+/// a latency similar to that of register copies, because it frees up memory
+/// units, which helps avoid back-end stalls.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/CommandLine.h"
+
+using namespace llvm;
+
+namespace {
+
+class Spill2Reg : public MachineFunctionPass {
+public:
+  static char ID;
+  Spill2Reg() : MachineFunctionPass(ID) {
+    initializeSpill2RegPass(*PassRegistry::getPassRegistry());
+  }
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+  void releaseMemory() override;
+  bool runOnMachineFunction(MachineFunction &) override;
+};
+
+} // namespace
+
+void Spill2Reg::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesCFG();
+  MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+void Spill2Reg::releaseMemory() {}
+
+bool Spill2Reg::runOnMachineFunction(MachineFunction &MFn) {
+  llvm_unreachable("Unimplemented");
+}
+
+char Spill2Reg::ID = 0;
+
+char &llvm::Spill2RegID = Spill2Reg::ID;
+
+INITIALIZE_PASS_BEGIN(Spill2Reg, "spill2reg", "Spill2Reg", false, false)
+INITIALIZE_PASS_END(Spill2Reg, "spill2reg", "Spill2Reg", false, false)
diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp
index d407e9f0871d4c..87ee076db7a9f3 100644
--- a/llvm/lib/CodeGen/TargetPassConfig.cpp
+++ b/llvm/lib/CodeGen/TargetPassConfig.cpp
@@ -214,6 +214,11 @@ static cl::opt<bool> DisableReplaceWithVecLib(
     "disable-replace-with-vec-lib", cl::Hidden,
     cl::desc("Disable replace with vector math call pass"));
 
+// Enable the Spill2Reg pass.
+static cl::opt<bool> EnableSpill2Reg("enable-spill2reg", cl::Hidden,
+                                     cl::init(false),
+                                     cl::desc("Enable Spill2Reg pass"));
+
 /// Option names for limiting the codegen pipeline.
 /// Those are used in error reporting and we didn't want
 /// to duplicate their names all over the place.
@@ -1415,6 +1420,10 @@ bool TargetPassConfig::addRegAssignAndRewriteOptimized() {
   // Finally rewrite virtual registers.
   addPass(&VirtRegRewriterID);
 
+  // Replace spills to stack with spills to registers.
+  if (EnableSpill2Reg)
+    addPass(&Spill2RegID);
+
   // Regalloc scoring for ML-driven eviction - noop except when learning a new
   // eviction policy.
   addPass(createRegAllocScoringPass());

>From e3cb1c3728732ac5f892a670b99104c51d4e48ff Mon Sep 17 00:00:00 2001
From: Vasileios Porpodas <vporpodas at google.com>
Date: Fri, 20 Dec 2024 01:52:18 -0800
Subject: [PATCH 2/9] [Spill2Reg] This patch adds spill/reload collection

Walk through the code looking for spills and reloads and group them
per stack slot.
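
For instance, given MIR like the following (a hypothetical sketch in the
style of the tests added later in this series; the $ecx spill is made up),
the pass builds one entry per stack slot, each holding its spills and
reloads:

  MOV32mr %stack.0, 1, $noreg, 0, $noreg, killed renamable $eax  ; spill, slot 0
  MOV32mr %stack.1, 1, $noreg, 0, $noreg, killed renamable $ecx  ; spill, slot 1
  $eax = MOV32rm %stack.0, 1, $noreg, 0, $noreg                  ; reload, slot 0
  $ecx = MOV32rm %stack.1, 1, $noreg, 0, $noreg                  ; reload, slot 1

Any other use of a frame index (e.g., a folded memory operand or an x86
LEA) marks the corresponding entry as disabled.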

Original review: https://reviews.llvm.org/D118299
---
 llvm/include/llvm/CodeGen/TargetInstrInfo.h |  21 +++
 llvm/lib/CodeGen/Spill2Reg.cpp              | 156 +++++++++++++++++++-
 llvm/lib/Target/X86/X86InstrInfo.cpp        |  66 ++++++++-
 llvm/lib/Target/X86/X86InstrInfo.h          |  19 +++
 4 files changed, 256 insertions(+), 6 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index 408adcd330b846..1e3922f5439b4e 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -294,6 +294,11 @@ class TargetInstrInfo : public MCInstrInfo {
     return isLoadFromStackSlot(MI, FrameIndex);
   }
 
+  /// A variant of isLoadFromStackSlot() that returns the spilled operand
+  /// rather than its register, or nullptr if \p MI is not a load from a
+  /// stack slot.
+  virtual const MachineOperand *isLoadFromStackSlotMO(const MachineInstr &MI,
+                                                      int &FrameIndex) const {
+    llvm_unreachable(
+        "Target didn't implement TargetInstrInfo::isLoadFromStackSlotMO!");
+  }
+
   /// Check for post-frame ptr elimination stack locations as well.
   /// This uses a heuristic so it isn't reliable for correctness.
   virtual Register isLoadFromStackSlotPostFE(const MachineInstr &MI,
@@ -321,6 +326,11 @@ class TargetInstrInfo : public MCInstrInfo {
     return 0;
   }
 
+  /// A variant of isStoreToStackSlot() that returns the spilled operand
+  /// rather than its register, or nullptr if \p MI is not a store to a
+  /// stack slot.
+  virtual const MachineOperand *isStoreToStackSlotMO(const MachineInstr &MI,
+                                                     int &FrameIndex) const {
+    llvm_unreachable(
+        "Target didn't implement TargetInstrInfo::isStoreToStackSlotMO!");
+  }
+
   /// Optional extension of isStoreToStackSlot that returns the number of
   /// bytes stored to the stack. This must be implemented if a backend
   /// supports partial stack slot spills/loads to further disambiguate
@@ -2284,6 +2294,17 @@ class TargetInstrInfo : public MCInstrInfo {
     llvm_unreachable("unknown number of operands necessary");
   }
 
+  /// \Returns true if a spill/reload of \p Reg can be handled by Spill2Reg.
+  virtual bool isLegalToSpill2Reg(Register Reg, const TargetRegisterInfo *TRI,
+                                  const MachineRegisterInfo *MRI) const {
+    llvm_unreachable(
+        "Target didn't implement TargetInstrInfo::isLegalToSpill2Reg!");
+  }
+
+  /// \Returns true if the target has the vector instructions needed to
+  /// implement Spill2Reg.
+  virtual bool targetSupportsSpill2Reg(const TargetSubtargetInfo *STI) const {
+    return false;
+  }
+
 private:
   mutable std::unique_ptr<MIRFormatter> Formatter;
   unsigned CallFrameSetupOpcode, CallFrameDestroyOpcode;
diff --git a/llvm/lib/CodeGen/Spill2Reg.cpp b/llvm/lib/CodeGen/Spill2Reg.cpp
index 09ffa71b891cb5..f0a7a0ebdaffb1 100644
--- a/llvm/lib/CodeGen/Spill2Reg.cpp
+++ b/llvm/lib/CodeGen/Spill2Reg.cpp
@@ -15,10 +15,15 @@
 ///
 //===----------------------------------------------------------------------===//
 
+#include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
 
 using namespace llvm;
 
@@ -33,6 +38,56 @@ class Spill2Reg : public MachineFunctionPass {
   void getAnalysisUsage(AnalysisUsage &AU) const override;
   void releaseMemory() override;
   bool runOnMachineFunction(MachineFunction &) override;
+
+private:
+  /// Holds data for spills and reloads.
+  struct StackSlotDataEntry {
+    /// This is set to true to disable code generation for the spills/reloads
+    /// that we collected in this entry.
+    bool Disable = false;
+    /// Indentation for the dump() methods.
+    static constexpr const int DumpInd = 2;
+
+    /// The data held for each spill/reload.
+    struct MIData {
+      MIData(MachineInstr *MI, const MachineOperand *MO, unsigned SpillBits)
+          : MI(MI), MO(MO), SpillBits(SpillBits) {}
+      /// The Spill/Reload instruction.
+      MachineInstr *MI = nullptr;
+      /// The operand being spilled/reloaded.
+      const MachineOperand *MO = nullptr;
+      /// The size of the data spilled/reloaded in bits. This occasionally
+      /// differs across accesses to the same stack slot.
+      unsigned SpillBits = 0;
+#ifndef NDEBUG
+      LLVM_DUMP_METHOD void dump() const;
+#endif
+    };
+    SmallVector<MIData, 1> Spills;
+    SmallVector<MIData, 1> Reloads;
+
+#ifndef NDEBUG
+    LLVM_DUMP_METHOD void dump() const;
+#endif
+  };
+  /// Look for candidates for spill2reg. These candidates are in places with
+  /// high memory unit contention. Fills in StackSlotData.
+  void collectSpillsAndReloads();
+  /// Replace spills to stack with spills to registers (same for reloads).
+  void generateCode();
+  /// Cleanup data structures once the pass is finished.
+  void cleanup();
+  /// The main entry point for this pass.
+  bool run();
+
+  /// Map from a stack slot to the corresponding spills and reloads.
+  DenseMap<int, StackSlotDataEntry> StackSlotData;
+
+  MachineFunction *MF = nullptr;
+  MachineRegisterInfo *MRI = nullptr;
+  MachineFrameInfo *MFI = nullptr;
+  const TargetInstrInfo *TII = nullptr;
+  const TargetRegisterInfo *TRI = nullptr;
 };
 
 } // namespace
@@ -45,12 +100,111 @@ void Spill2Reg::getAnalysisUsage(AnalysisUsage &AU) const {
 void Spill2Reg::releaseMemory() {}
 
 bool Spill2Reg::runOnMachineFunction(MachineFunction &MFn) {
-  llvm_unreachable("Unimplemented");
+  // Disable if NoImplicitFloat to avoid emitting instrs that use vectors.
+  if (MFn.getFunction().hasFnAttribute(Attribute::NoImplicitFloat))
+    return false;
+
+  MF = &MFn;
+  MRI = &MF->getRegInfo();
+  MFI = &MF->getFrameInfo();
+  TII = MF->getSubtarget().getInstrInfo();
+  TRI = MF->getSubtarget().getRegisterInfo();
+  // Enable only if the target supports the appropriate vector instruction set.
+  if (!TII->targetSupportsSpill2Reg(&MF->getSubtarget()))
+    return false;
+
+  return run();
 }
 
 char Spill2Reg::ID = 0;
 
 char &llvm::Spill2RegID = Spill2Reg::ID;
 
+void Spill2Reg::collectSpillsAndReloads() {
+  // The checks for collecting spills and reloads are identical, so we keep
+  // them here in one place. Return true if we should not collect this.
+  auto SkipEntry = [this](int StackSlot, Register Reg) -> bool {
+    // If not a spill/reload stack slot.
+    if (!MFI->isSpillSlotObjectIndex(StackSlot))
+      return true;
+    // Skip registers that the target cannot handle (e.g., unsupported sizes).
+    if (!TII->isLegalToSpill2Reg(Reg, TRI, MRI))
+      return true;
+    return false;
+  };
+
+  // Collect spills and reloads and associate them to stack slots.
+  // If any spill/reload for a stack slot is found not to be eligible for
+  // spill-to-reg, then that stack slot is disabled.
+  for (MachineBasicBlock &MBB : *MF) {
+    for (MachineInstr &MI : MBB) {
+      int StackSlot;
+      if (const MachineOperand *MO = TII->isStoreToStackSlotMO(MI, StackSlot)) {
+        MachineInstr *Spill = &MI;
+        auto &Entry = StackSlotData[StackSlot];
+        if (Entry.Disable || SkipEntry(StackSlot, MO->getReg())) {
+          Entry.Disable = true;
+          continue;
+        }
+        unsigned SpillBits = TRI->getRegSizeInBits(MO->getReg(), *MRI);
+        Entry.Spills.emplace_back(Spill, MO, SpillBits);
+      } else if (const MachineOperand *MO =
+                     TII->isLoadFromStackSlotMO(MI, StackSlot)) {
+        MachineInstr *Reload = &MI;
+        auto &Entry = StackSlotData[StackSlot];
+        if (Entry.Disable || SkipEntry(StackSlot, MO->getReg())) {
+          Entry.Disable = true;
+          continue;
+        }
+        assert(Reload->getRestoreSize(TII) && "Expected reload");
+        unsigned SpillBits = TRI->getRegSizeInBits(MO->getReg(), *MRI);
+        Entry.Reloads.emplace_back(Reload, MO, SpillBits);
+      } else {
+        // This should capture uses of the stack in instructions that access
+        // memory (e.g., folded spills/reloads) and non-memory instructions,
+        // like x86 LEA.
+        for (const MachineOperand &MO : MI.operands())
+          if (MO.isFI()) {
+            int StackSlot = MO.getIndex();
+            auto &Entry = StackSlotData[StackSlot];
+            Entry.Disable = true;
+          }
+      }
+    }
+  }
+}
+
+void Spill2Reg::generateCode() { llvm_unreachable("Unimplemented"); }
+
+void Spill2Reg::cleanup() { StackSlotData.clear(); }
+
+bool Spill2Reg::run() {
+  // Walk over the code looking for spills and reloads, and group them per
+  // stack slot.
+  collectSpillsAndReloads();
+
+  // Replace each spill/reload to a stack slot with a register spill/reload.
+  generateCode();
+
+  cleanup();
+  return true;
+}
+
+#ifndef NDEBUG
+void Spill2Reg::StackSlotDataEntry::MIData::dump() const {
+  dbgs() << "  (" << *MO << ") " << *MI;
+}
+
+void Spill2Reg::StackSlotDataEntry::dump() const {
+  dbgs().indent(DumpInd) << "Disable: " << Disable << "\n";
+  dbgs().indent(DumpInd) << "Spills:\n";
+  for (const MIData &Data : Spills)
+    Data.dump();
+  dbgs().indent(DumpInd) << "Reloads:\n";
+  for (const MIData &Data : Reloads)
+    Data.dump();
+}
+#endif
+
 INITIALIZE_PASS_BEGIN(Spill2Reg, "spill2reg", "Spill2Reg", false, false)
 INITIALIZE_PASS_END(Spill2Reg, "spill2reg", "Spill2Reg", false, false)
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 5a6ea1182ccb83..bf136a9a00b295 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -691,13 +691,27 @@ Register X86InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
   return X86InstrInfo::isLoadFromStackSlot(MI, FrameIndex, Dummy);
 }
 
+const MachineOperand *
+X86InstrInfo::isLoadFromStackSlotMO(const MachineInstr &MI, int &FrameIndex,
+                                    unsigned &MemBytes) const {
+  if (isFrameLoadOpcode(MI.getOpcode(), MemBytes))
+    if (MI.getOperand(0).getSubReg() == 0 && isFrameOperand(MI, 1, FrameIndex))
+      return &MI.getOperand(0);
+  return nullptr;
+}
+
+const MachineOperand *
+X86InstrInfo::isLoadFromStackSlotMO(const MachineInstr &MI,
+                                    int &FrameIndex) const {
+  unsigned UnusedMemBytes;
+  return isLoadFromStackSlotMO(MI, FrameIndex, UnusedMemBytes);
+}
+
 Register X86InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
                                            int &FrameIndex,
                                            unsigned &MemBytes) const {
-  if (isFrameLoadOpcode(MI.getOpcode(), MemBytes))
-    if (MI.getOperand(0).getSubReg() == 0 && isFrameOperand(MI, 1, FrameIndex))
-      return MI.getOperand(0).getReg();
-  return 0;
+  const MachineOperand *MO = isLoadFromStackSlotMO(MI, FrameIndex, MemBytes);
+  return MO != nullptr ? (unsigned)MO->getReg() : 0;
 }
 
 Register X86InstrInfo::isLoadFromStackSlotPostFE(const MachineInstr &MI,
@@ -719,10 +733,30 @@ Register X86InstrInfo::isLoadFromStackSlotPostFE(const MachineInstr &MI,
   return 0;
 }
 
+const MachineOperand *
+X86InstrInfo::isStoreToStackSlotMO(const MachineInstr &MI, int &FrameIndex,
+                                   unsigned &MemBytes) const {
+  if (isFrameStoreOpcode(MI.getOpcode(), MemBytes))
+    if (MI.getOperand(X86::AddrNumOperands).getSubReg() == 0 &&
+        isFrameOperand(MI, 0, FrameIndex))
+      return &MI.getOperand(X86::AddrNumOperands);
+  return nullptr;
+}
+
+const MachineOperand *
+X86InstrInfo::isStoreToStackSlotMO(const MachineInstr &MI,
+                                   int &FrameIndex) const {
+  unsigned UnusedMemBytes;
+  return isStoreToStackSlotMO(MI, FrameIndex, UnusedMemBytes);
+}
+
 Register X86InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
                                           int &FrameIndex) const {
   unsigned Dummy;
-  return X86InstrInfo::isStoreToStackSlot(MI, FrameIndex, Dummy);
+  const auto *MO = X86InstrInfo::isStoreToStackSlotMO(MI, FrameIndex, Dummy);
+  if (MO != nullptr)
+    return MO->getReg();
+  return 0;
 }
 
 Register X86InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
@@ -10893,5 +10927,27 @@ void X86InstrInfo::getFrameIndexOperands(SmallVectorImpl<MachineOperand> &Ops,
   M.getFullAddress(Ops);
 }
 
+bool X86InstrInfo::isLegalToSpill2Reg(Register Reg,
+                                      const TargetRegisterInfo *TRI,
+                                      const MachineRegisterInfo *MRI) const {
+  // Skip instructions like `$k1 = KMOVWkm %stack.1` because replacing stack
+  // with xmm0 results in an illegal instruction `movq  %k1, %xmm0`.
+  if (X86::VK16RegClass.contains(Reg))
+    return false;
+
+  switch (TRI->getRegSizeInBits(Reg, *MRI)) {
+  case 64:
+  case 32:
+    return true;
+  }
+  return false;
+}
+
+bool X86InstrInfo::targetSupportsSpill2Reg(
+    const TargetSubtargetInfo *STI) const {
+  const X86Subtarget *X86STI = static_cast<const X86Subtarget *>(STI);
+  return X86STI->hasSSE41();
+}
+
 #define GET_INSTRINFO_HELPERS
 #include "X86GenInstrInfo.inc"
diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h
index b006bc3971984c..000e0c579118bc 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.h
+++ b/llvm/lib/Target/X86/X86InstrInfo.h
@@ -272,6 +272,13 @@ class X86InstrInfo final : public X86GenInstrInfo {
   /// FIXME: This should become part of our instruction tables.
   static bool isDataInvariantLoad(MachineInstr &MI);
 
+  const MachineOperand *isLoadFromStackSlotMO(const MachineInstr &MI,
+                                              int &FrameIndex,
+                                              unsigned &MemBytes) const;
+
+  const MachineOperand *isLoadFromStackSlotMO(const MachineInstr &MI,
+                                              int &FrameIndex) const override;
+
   Register isLoadFromStackSlot(const MachineInstr &MI,
                                int &FrameIndex) const override;
   Register isLoadFromStackSlot(const MachineInstr &MI,
@@ -283,6 +290,13 @@ class X86InstrInfo final : public X86GenInstrInfo {
   Register isLoadFromStackSlotPostFE(const MachineInstr &MI,
                                      int &FrameIndex) const override;
 
+  const MachineOperand *isStoreToStackSlotMO(const MachineInstr &MI,
+                                             int &FrameIndex,
+                                             unsigned &MemBytes) const;
+
+  const MachineOperand *isStoreToStackSlotMO(const MachineInstr &MI,
+                                             int &FrameIndex) const override;
+
   Register isStoreToStackSlot(const MachineInstr &MI,
                               int &FrameIndex) const override;
   Register isStoreToStackSlot(const MachineInstr &MI,
@@ -728,6 +742,11 @@ class X86InstrInfo final : public X86GenInstrInfo {
   /// \returns the index of operand that is commuted with \p Idx1. If the method
   /// fails to commute the operands, it will return \p Idx1.
   unsigned commuteOperandsForFold(MachineInstr &MI, unsigned Idx1) const;
+
+  bool isLegalToSpill2Reg(Register Reg, const TargetRegisterInfo *TRI,
+                          const MachineRegisterInfo *MRI) const override;
+
+  bool targetSupportsSpill2Reg(const TargetSubtargetInfo *STI) const override;
 };
 } // namespace llvm
 

>From 2be3f6afc6663dbae81a7280be17acd3d7125862 Mon Sep 17 00:00:00 2001
From: Vasileios Porpodas <vporpodas at google.com>
Date: Fri, 20 Dec 2024 01:55:12 -0800
Subject: [PATCH 3/9] [Spill2Reg] Code generation part 1

This patch adds the main structure of the code generation phase of Spill2Reg.
Iterate through the spills/reloads collected earlier and generate the
new instructions.

Original review: https://reviews.llvm.org/D118300
---
 llvm/include/llvm/CodeGen/TargetInstrInfo.h |  8 ++
 llvm/lib/CodeGen/Spill2Reg.cpp              | 88 ++++++++++++++++++++-
 llvm/lib/Target/X86/X86InstrInfo.cpp        |  8 ++
 llvm/lib/Target/X86/X86InstrInfo.h          |  4 +
 4 files changed, 107 insertions(+), 1 deletion(-)

diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index 1e3922f5439b4e..ca99e1a31da022 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -2305,6 +2305,14 @@ class TargetInstrInfo : public MCInstrInfo {
     return false;
   }
 
+  /// \Returns the vector register class to use for spilling \p SpilledReg.
+  virtual const TargetRegisterClass *
+  getVectorRegisterClassForSpill2Reg(const TargetRegisterInfo *TRI,
+                                     Register SpilledReg) const {
+    llvm_unreachable(
+        "Target didn't implement "
+        "TargetInstrInfo::getVectorRegisterClassForSpill2Reg!");
+  }
+
 private:
   mutable std::unique_ptr<MIRFormatter> Formatter;
   unsigned CallFrameSetupOpcode, CallFrameDestroyOpcode;
diff --git a/llvm/lib/CodeGen/Spill2Reg.cpp b/llvm/lib/CodeGen/Spill2Reg.cpp
index f0a7a0ebdaffb1..76ee001921f73f 100644
--- a/llvm/lib/CodeGen/Spill2Reg.cpp
+++ b/llvm/lib/CodeGen/Spill2Reg.cpp
@@ -15,6 +15,8 @@
 ///
 //===----------------------------------------------------------------------===//
 
+#include "AllocationOrder.h"
+#include "llvm/CodeGen/LiveRegUnits.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -66,6 +68,13 @@ class Spill2Reg : public MachineFunctionPass {
     SmallVector<MIData, 1> Spills;
     SmallVector<MIData, 1> Reloads;
 
+    /// \Returns the vector register class to use for spilling this entry's
+    /// register.
+    const TargetRegisterClass *
+    getSpilledRegClass(const TargetInstrInfo *TII,
+                       const TargetRegisterInfo *TRI) const {
+      auto Reg0 = Spills.front().MO->getReg();
+      return TII->getVectorRegisterClassForSpill2Reg(TRI, Reg0);
+    }
 #ifndef NDEBUG
     LLVM_DUMP_METHOD void dump() const;
 #endif
@@ -73,6 +82,22 @@ class Spill2Reg : public MachineFunctionPass {
   /// Look for candidates for spill2reg. These candidates are in places with
   /// high memory unit contention. Fills in StackSlotData.
   void collectSpillsAndReloads();
+  /// \Returns true if it is profitable to apply spill-to-reg to \p MI, i.e.,
+  /// if doing so is expected to remove pipeline bubbles.
+  bool isProfitable(const MachineInstr *MI) const;
+  /// \Returns true if all stack-based spills/reloads in \p Entry are
+  /// profitable to replace with reg-based spills/reloads.
+  bool allAccessesProfitable(const StackSlotDataEntry &Entry) const;
+  /// Look for a physical register of class \p RegClass that is free in \p LRU.
+  std::optional<MCRegister>
+  tryGetFreePhysicalReg(const TargetRegisterClass *RegClass,
+                        const LiveRegUnits &LRU);
+  /// Helper for generateCode(). It replaces the stack spills/reloads in
+  /// \p Entry with moves to/from \p VectorReg.
+  void replaceStackWithReg(StackSlotDataEntry &Entry, Register VectorReg);
+  /// Updates \p LRU with the liveness of physical registers around the spills
+  /// and reloads in \p Entry.
+  void calculateLiveRegs(StackSlotDataEntry &Entry, LiveRegUnits &LRU);
   /// Replace spills to stack with spills to registers (same for reloads).
   void generateCode();
   /// Cleanup data structures once the pass is finished.
@@ -88,6 +113,7 @@ class Spill2Reg : public MachineFunctionPass {
   MachineFrameInfo *MFI = nullptr;
   const TargetInstrInfo *TII = nullptr;
   const TargetRegisterInfo *TRI = nullptr;
+  RegisterClassInfo RegClassInfo;
 };
 
 } // namespace
@@ -113,6 +139,8 @@ bool Spill2Reg::runOnMachineFunction(MachineFunction &MFn) {
   if (!TII->targetSupportsSpill2Reg(&MF->getSubtarget()))
     return false;
 
+  RegClassInfo.runOnMachineFunction(MFn);
+
   return run();
 }
 
@@ -174,7 +202,65 @@ void Spill2Reg::collectSpillsAndReloads() {
   }
 }
 
-void Spill2Reg::generateCode() { llvm_unreachable("Unimplemented"); }
+bool Spill2Reg::isProfitable(const MachineInstr *MI) const {
+  // TODO: Unimplemented.
+  return true;
+}
+
+bool Spill2Reg::allAccessesProfitable(const StackSlotDataEntry &Entry) const {
+  auto IsProfitable = [this](const auto &MID) { return isProfitable(MID.MI); };
+  return llvm::all_of(Entry.Spills, IsProfitable) &&
+         llvm::all_of(Entry.Reloads, IsProfitable);
+}
+
+std::optional<MCRegister>
+Spill2Reg::tryGetFreePhysicalReg(const TargetRegisterClass *RegClass,
+                                 const LiveRegUnits &LRU) {
+  auto Order = RegClassInfo.getOrder(RegClass);
+  for (MCRegister PhysVectorReg : Order) {
+    if (LRU.available(PhysVectorReg))
+      return PhysVectorReg;
+  }
+  return std::nullopt;
+}
+
+// Replace stack-based spills/reloads with register-based ones.
+void Spill2Reg::replaceStackWithReg(StackSlotDataEntry &Entry,
+                                    Register VectorReg) {
+  // TODO: Unimplemented
+}
+
+void Spill2Reg::calculateLiveRegs(StackSlotDataEntry &Entry,
+                                  LiveRegUnits &LRU) {
+  // TODO: Unimplemented
+}
+
+void Spill2Reg::generateCode() {
+  for (auto &Pair : StackSlotData) {
+    StackSlotDataEntry &Entry = Pair.second;
+    // Skip if this stack slot was disabled during data collection.
+    if (Entry.Disable)
+      continue;
+
+    // Apply spill2reg only if all of this slot's spills/reloads look
+    // profitable.
+    if (!allAccessesProfitable(Entry))
+      continue;
+
+    // Calculate liveness for Entry.
+    LiveRegUnits LRU(*TRI);
+    calculateLiveRegs(Entry, LRU);
+
+    // Look for a free physical vector register, i.e., one not live in LRU.
+    std::optional<MCRegister> PhysVectorRegOpt =
+        tryGetFreePhysicalReg(Entry.getSpilledRegClass(TII, TRI), LRU);
+    if (!PhysVectorRegOpt)
+      continue;
+
+    // Replace stack accesses with register accesses.
+    replaceStackWithReg(Entry, *PhysVectorRegOpt);
+  }
+}
 
 void Spill2Reg::cleanup() { StackSlotData.clear(); }
 
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index bf136a9a00b295..608d310cc1eb61 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -10949,5 +10949,13 @@ bool X86InstrInfo::targetSupportsSpill2Reg(
   return X86STI->hasSSE41();
 }
 
+const TargetRegisterClass *
+X86InstrInfo::getVectorRegisterClassForSpill2Reg(const TargetRegisterInfo *TRI,
+                                                 Register SpilledReg) const {
+  const TargetRegisterClass *VecRegClass =
+      TRI->getRegClass(X86::VR128RegClassID);
+  return VecRegClass;
+}
+
 #define GET_INSTRINFO_HELPERS
 #include "X86GenInstrInfo.inc"
diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h
index 000e0c579118bc..b959d69afd9332 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.h
+++ b/llvm/lib/Target/X86/X86InstrInfo.h
@@ -747,6 +747,10 @@ class X86InstrInfo final : public X86GenInstrInfo {
                           const MachineRegisterInfo *MRI) const override;
 
   bool targetSupportsSpill2Reg(const TargetSubtargetInfo *STI) const override;
+
+  const TargetRegisterClass *
+  getVectorRegisterClassForSpill2Reg(const TargetRegisterInfo *TRI,
+                                     Register SpilledReg) const override;
 };
 } // namespace llvm
 

>From 0f88d141c0236e23761bf7eeeaa9c33512db23d1 Mon Sep 17 00:00:00 2001
From: Vasileios Porpodas <vporpodas at google.com>
Date: Fri, 20 Dec 2024 02:10:24 -0800
Subject: [PATCH 4/9] [Spill2Reg] Adds x86 profitability model

This patch implements a simple model for checking when Spill2Reg is profitable
on x86. It walks through the instructions in the proximity of the spill or
reload and counts the memory and vector instructions. If the percentage of
memory instructions is at least a threshold and the count of vector
instructions is below a threshold, the spill/reload is considered profitable.
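
As a worked example (the numbers are made up): with the default
-spill2reg-exploration-distance=4, the model inspects a window of up to
2 * 4 = 8 non-debug instructions around the spill/reload. If 7 of them
access memory and none touches a vector register:

  CntAll = 8
  CntMem = 7  ->  7 * 100 / 8 = 87 >= 80  (passes -spill2reg-mem-instrs=80)
  CntVec = 0  ->  0 < 1                   (passes -spill2reg-vec-instrs=1)

so the access is considered profitable. If the window hits the start or
end of the basic block before filling up, the check conservatively fails
(when the memory-instruction threshold is enabled).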

Original review: https://reviews.llvm.org/D118301
---
 llvm/include/llvm/CodeGen/TargetInstrInfo.h |  28 +++++
 llvm/lib/Target/X86/X86InstrInfo.cpp        | 128 ++++++++++++++++++++
 llvm/lib/Target/X86/X86InstrInfo.h          |  16 +++
 3 files changed, 172 insertions(+)

diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index ca99e1a31da022..33a0ed3c23c160 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -2313,6 +2313,34 @@ class TargetInstrInfo : public MCInstrInfo {
         "TargetInstrInfo::createVirtualVectorRegisterForSpillToReg!");
   }
 
+  /// \Returns true if it is profitable to perform spill2reg on \p MI.
+  virtual bool isSpill2RegProfitable(const MachineInstr *MI,
+                                     const TargetRegisterInfo *TRI,
+                                     const MachineRegisterInfo *MRI) const {
+    llvm_unreachable(
+        "Target didn't implement TargetInstrInfo::isSpill2RegProfitable!");
+  }
+
+  /// Inserts \p SrcReg into the first lane of \p DstReg.
+  virtual MachineInstr *
+  spill2RegInsertToVectorReg(Register DstReg, Register SrcReg,
+                             int OperationBits, MachineBasicBlock *MBB,
+                             MachineBasicBlock::iterator InsertBeforeIt,
+                             const TargetRegisterInfo *TRI) const {
+    llvm_unreachable(
+        "Target didn't implement TargetInstrInfo::spill2RegInsertToVectorReg!");
+  }
+
+  /// Extracts the first lane of \p SrcReg into \p DstReg.
+  virtual MachineInstr *
+  spill2RegExtractFromVectorReg(Register DstReg, Register SrcReg,
+                                int OperationBits, MachineBasicBlock *InsertMBB,
+                                MachineBasicBlock::iterator InsertBeforeIt,
+                                const TargetRegisterInfo *TRI) const {
+    llvm_unreachable("Target didn't implement "
+                     "TargetInstrInfo::spill2RegExtractFromVectorReg!");
+  }
+
 private:
   mutable std::unique_ptr<MIRFormatter> Formatter;
   unsigned CallFrameSetupOpcode, CallFrameDestroyOpcode;
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 608d310cc1eb61..8f08f8bff75980 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -78,6 +78,22 @@ static cl::opt<unsigned> UndefRegClearance(
              "certain undef register reads"),
     cl::init(128), cl::Hidden);
 
+// The default threshold values were tuned empirically on Skylake.
+static cl::opt<int> Spill2RegMemInstrsThreshold(
+    "spill2reg-mem-instrs", cl::Hidden, cl::init(80),
+    cl::desc("Apply spill2reg if at least this percentage of the instrs "
+             "within the explored distance are memory instrs."));
+
+static cl::opt<int> Spill2RegVecInstrsThreshold(
+    "spill2reg-vec-instrs", cl::Hidden, cl::init(1),
+    cl::desc("Apply spill2reg if we find fewer than this many vector instrs "
+             "within the explored distance."));
+
+static cl::opt<int> Spill2RegExplorationDst(
+    "spill2reg-exploration-distance", cl::Hidden, cl::init(4),
+    cl::desc("When checking for profitability, explore nearby instructions "
+             "at this maximum distance."));
+
 // Pin the vtable to this file.
 void X86InstrInfo::anchor() {}
 
@@ -10957,5 +10973,117 @@ X86InstrInfo::getVectorRegisterClassForSpill2Reg(const TargetRegisterInfo *TRI,
   return VecRegClass;
 }
 
+bool X86InstrInfo::isSpill2RegProfitable(const MachineInstr *MI,
+                                         const TargetRegisterInfo *TRI,
+                                         const MachineRegisterInfo *MRI) const {
+  auto IsVecMO = [TRI, MI](const MachineOperand &MO) {
+    const MachineFunction *MF = MI->getParent()->getParent();
+    if (MO.isReg() && MO.getReg().isPhysical()) {
+      for (auto ClassID :
+           {X86::VR128RegClassID, X86::VR256RegClassID, X86::VR512RegClassID})
+        if (TRI->getRegClass(ClassID)->contains(MO.getReg()))
+          return true;
+    }
+    if (MO.isFI()) {
+      const unsigned MinVecBits =
+          TRI->getRegSizeInBits(*TRI->getRegClass(X86::VR128RegClassID));
+      // getObjectSize() is in bytes; convert to bits before comparing.
+      if (MF->getFrameInfo().getObjectSize(MO.getIndex()) * 8 >= MinVecBits)
+        return true;
+    }
+    return false;
+  };
+
+  /// \Returns the previous instruction, skipping debug instrs.
+  auto GetPrevNonDebug = [](const MachineInstr *MI) {
+    do {
+      MI = MI->getPrevNode();
+    } while (MI != nullptr && MI->isDebugInstr());
+    return MI;
+  };
+  /// \Returns the next instruction, skipping debug instrs.
+  auto GetNextNonDebug = [](const MachineInstr *MI) {
+    do {
+      MI = MI->getNextNode();
+    } while (MI != nullptr && MI->isDebugInstr());
+    return MI;
+  };
+
+  // This is a simple heuristic. We count the number of memory and vector
+  // instructions both above and below `MI` with a radius of
+  // Spill2RegExplorationDst, and check against threshold values.
+  int CntMem = 0;
+  int CntAll = 0;
+  int CntVec = 0;
+  const MachineInstr *TopMI = MI;
+  const MachineInstr *BotMI = GetNextNonDebug(MI);
+  for (int Radius = 0, MaxRadius = Spill2RegExplorationDst;
+       (TopMI != nullptr || BotMI != nullptr) && Radius < MaxRadius; ++Radius) {
+    if (TopMI != nullptr && !TopMI->memoperands_empty())
+      ++CntMem;
+    if (BotMI != nullptr && !BotMI->memoperands_empty())
+      ++CntMem;
+    if (TopMI != nullptr && llvm::any_of(TopMI->operands(), IsVecMO))
+      ++CntVec;
+    if (BotMI != nullptr && llvm::any_of(BotMI->operands(), IsVecMO))
+      ++CntVec;
+
+    if (TopMI != nullptr) {
+      TopMI = GetPrevNonDebug(TopMI);
+      ++CntAll;
+    }
+    if (BotMI != nullptr) {
+      BotMI = GetNextNonDebug(BotMI);
+      ++CntAll;
+    }
+  }
+  // Return false if exploration ended early because we reached the end of BB.
+  if (Spill2RegMemInstrsThreshold != 0 && CntAll < 2 * Spill2RegExplorationDst)
+    return false;
+  // Else check against the thresholds.
+  bool MemHeuristic = Spill2RegMemInstrsThreshold == 0 ||
+                      (CntMem * 100) / CntAll >= Spill2RegMemInstrsThreshold;
+  bool VecHeuristic =
+      Spill2RegVecInstrsThreshold == 0 || CntVec < Spill2RegVecInstrsThreshold;
+  return MemHeuristic && VecHeuristic;
+}
+
+static unsigned getInsertOrExtractOpcode(unsigned Bits, bool Insert) {
+  switch (Bits) {
+  case 32:
+    return Insert ? X86::MOVDI2PDIrr : X86::MOVPDI2DIrr;
+  case 64:
+    return Insert ? X86::MOV64toPQIrr : X86::MOVPQIto64rr;
+  default:
+    llvm_unreachable("Unsupported bits");
+  }
+}
+
+MachineInstr *X86InstrInfo::spill2RegInsertToVectorReg(
+    Register DstReg, Register SrcReg, int OperationBits, MachineBasicBlock *MBB,
+    MachineBasicBlock::iterator InsertBeforeIt,
+    const TargetRegisterInfo *TRI) const {
+  DebugLoc DL;
+  unsigned InsertOpcode =
+      getInsertOrExtractOpcode(OperationBits, true /*insert*/);
+  const MCInstrDesc &InsertMCID = get(InsertOpcode);
+  MachineInstr *InsertMI =
+      BuildMI(*MBB, InsertBeforeIt, DL, InsertMCID, DstReg).addReg(SrcReg);
+  return InsertMI;
+}
+
+MachineInstr *X86InstrInfo::spill2RegExtractFromVectorReg(
+    Register DstReg, Register SrcReg, int OperationBits,
+    MachineBasicBlock *InsertMBB, MachineBasicBlock::iterator InsertBeforeIt,
+    const TargetRegisterInfo *TRI) const {
+  DebugLoc DL;
+  unsigned ExtractOpcode =
+      getInsertOrExtractOpcode(OperationBits, false /*extract*/);
+  const MCInstrDesc &ExtractMCID = get(ExtractOpcode);
+  MachineInstr *ExtractMI =
+      BuildMI(*InsertMBB, InsertBeforeIt, DL, ExtractMCID, DstReg)
+          .addReg(SrcReg);
+  return ExtractMI;
+}
+
 #define GET_INSTRINFO_HELPERS
 #include "X86GenInstrInfo.inc"
diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h
index b959d69afd9332..3d67a5e4522e71 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.h
+++ b/llvm/lib/Target/X86/X86InstrInfo.h
@@ -751,6 +751,22 @@ class X86InstrInfo final : public X86GenInstrInfo {
   const TargetRegisterClass *
   getVectorRegisterClassForSpill2Reg(const TargetRegisterInfo *TRI,
                                      Register SpilledReg) const override;
+
+  bool isSpill2RegProfitable(const MachineInstr *MI,
+                             const TargetRegisterInfo *TRI,
+                             const MachineRegisterInfo *MRI) const override;
+
+  MachineInstr *
+  spill2RegInsertToVectorReg(Register DstReg, Register SrcReg,
+                             int OperationBits, MachineBasicBlock *MBB,
+                             MachineBasicBlock::iterator InsertBeforeIt,
+                             const TargetRegisterInfo *TRI) const override;
+
+  MachineInstr *
+  spill2RegExtractFromVectorReg(Register DstReg, Register SrcReg,
+                                int OperationBits, MachineBasicBlock *InsertMBB,
+                                MachineBasicBlock::iterator InsertBeforeIt,
+                                const TargetRegisterInfo *TRI) const override;
 };
 } // namespace llvm
 

>From 92e26f035193c406f22e6a0d542bd16d55efdba9 Mon Sep 17 00:00:00 2001
From: Vasileios Porpodas <vporpodas at google.com>
Date: Fri, 20 Dec 2024 02:15:11 -0800
Subject: [PATCH 5/9] [Spill2Reg] Code generation part 2

Spill2Reg can now emit spill and reload instructions.
This does not yet generate correct code, as it does not keep track of live
registers.

Original review: https://reviews.llvm.org/D118302
---
 llvm/lib/CodeGen/Spill2Reg.cpp                |  37 +++-
 .../X86/spill2reg_avoid_vector_instrs.mir     |  48 +++++
 ...spill2reg_disable_when_noimplicitfloat.mir |  37 ++++
 .../CodeGen/X86/spill2reg_mask_spills.mir     |  33 +++
 .../CodeGen/X86/spill2reg_simple_1_32bit.mir  |  47 ++++
 .../CodeGen/X86/spill2reg_simple_1_64bit.mir  |  47 ++++
 llvm/test/CodeGen/X86/spill2reg_simple_2.mir  |  52 +++++
 llvm/test/CodeGen/X86/spill2reg_simple_3.mir  | 203 ++++++++++++++++++
 .../Transforms/SLPVectorizer/stores_init.ll   |  10 +
 9 files changed, 511 insertions(+), 3 deletions(-)
 create mode 100644 llvm/test/CodeGen/X86/spill2reg_avoid_vector_instrs.mir
 create mode 100644 llvm/test/CodeGen/X86/spill2reg_disable_when_noimplicitfloat.mir
 create mode 100644 llvm/test/CodeGen/X86/spill2reg_mask_spills.mir
 create mode 100644 llvm/test/CodeGen/X86/spill2reg_simple_1_32bit.mir
 create mode 100644 llvm/test/CodeGen/X86/spill2reg_simple_1_64bit.mir
 create mode 100644 llvm/test/CodeGen/X86/spill2reg_simple_2.mir
 create mode 100644 llvm/test/CodeGen/X86/spill2reg_simple_3.mir
 create mode 100644 llvm/test/Transforms/SLPVectorizer/stores_init.ll

diff --git a/llvm/lib/CodeGen/Spill2Reg.cpp b/llvm/lib/CodeGen/Spill2Reg.cpp
index 76ee001921f73f..7c6e0a5dd64d33 100644
--- a/llvm/lib/CodeGen/Spill2Reg.cpp
+++ b/llvm/lib/CodeGen/Spill2Reg.cpp
@@ -16,6 +16,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "AllocationOrder.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/LiveRegUnits.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
@@ -29,6 +30,9 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "Spill2Reg"
+STATISTIC(NumSpill2RegInstrs, "Number of spills/reloads replaced by spill2reg");
+
 namespace {
 
 class Spill2Reg : public MachineFunctionPass {
@@ -203,8 +207,7 @@ void Spill2Reg::collectSpillsAndReloads() {
 }
 
 bool Spill2Reg::isProfitable(const MachineInstr *MI) const {
-  // TODO: Unimplemented.
-  return true;
+  return TII->isSpill2RegProfitable(MI, TRI, MRI);
 }
 
 bool Spill2Reg::allAccessesProfitable(const StackSlotDataEntry &Entry) const {
@@ -228,7 +231,33 @@ Spill2Reg::tryGetFreePhysicalReg(const TargetRegisterClass *RegClass,
 // Replace stack-based spills/reloads with register-based ones.
 void Spill2Reg::replaceStackWithReg(StackSlotDataEntry &Entry,
                                     Register VectorReg) {
-  // TODO: Unimplemented
+  for (StackSlotDataEntry::MIData &SpillData : Entry.Spills) {
+    MachineInstr *StackSpill = SpillData.MI;
+    assert(SpillData.MO->isReg() && "Expected register MO");
+    Register OldReg = SpillData.MO->getReg();
+
+    TII->spill2RegInsertToVectorReg(
+        VectorReg, OldReg, SpillData.SpillBits, StackSpill->getParent(),
+        /*InsertBeforeIt=*/StackSpill->getIterator(), TRI);
+
+    // Spill to stack is no longer needed.
+    StackSpill->eraseFromParent();
+    assert(OldReg.isPhysical() && "Otherwise we need to removeInterval()");
+  }
+
+  for (StackSlotDataEntry::MIData &ReloadData : Entry.Reloads) {
+    MachineInstr *StackReload = ReloadData.MI;
+    assert(ReloadData.MO->isReg() && "Expected Reg MO");
+    Register OldReg = ReloadData.MO->getReg();
+
+    TII->spill2RegExtractFromVectorReg(
+        OldReg, VectorReg, ReloadData.SpillBits, StackReload->getParent(),
+        /*InsertBeforeIt=*/StackReload->getIterator(), TRI);
+
+    // Reload from stack is no longer needed.
+    StackReload->eraseFromParent();
+    assert(OldReg.isPhysical() && "Otherwise we need to removeInterval()");
+  }
 }
 
 void Spill2Reg::calculateLiveRegs(StackSlotDataEntry &Entry,
@@ -259,6 +288,8 @@ void Spill2Reg::generateCode() {
 
     // Replace stack accesses with register accesses.
     replaceStackWithReg(Entry, *PhysVectorRegOpt);
+
+    NumSpill2RegInstrs += Entry.Spills.size() + Entry.Reloads.size();
   }
 }
 
diff --git a/llvm/test/CodeGen/X86/spill2reg_avoid_vector_instrs.mir b/llvm/test/CodeGen/X86/spill2reg_avoid_vector_instrs.mir
new file mode 100644
index 00000000000000..e01a9bfe7bf82c
--- /dev/null
+++ b/llvm/test/CodeGen/X86/spill2reg_avoid_vector_instrs.mir
@@ -0,0 +1,48 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc %s -o - -mtriple=x86_64-unknown-linux -enable-spill2reg -mattr=+sse4.1 --run-pass=spill2reg -simplify-mir -spill2reg-mem-instrs=0 | FileCheck %s
+# RUN: llc %s -o - -mtriple=x86_64-unknown-linux -enable-spill2reg -mattr=+sse4.1 --run-pass=spill2reg -simplify-mir -spill2reg-mem-instrs=0 -spill2reg-vec-instrs=99999 | FileCheck %s --check-prefix=FORCED
+
+# Simple test to confirm that spill2reg won't apply if there is a vector
+# instruction nearby.
+
+--- |
+  @D0 = dso_local local_unnamed_addr global i64 0, align 4
+  @U0 = dso_local local_unnamed_addr global i64 0, align 4
+  define void @func() { ret void }
+...
+---
+name: func
+alignment:       16
+tracksRegLiveness: true
+tracksDebugUserValues: true
+frameInfo:
+  maxAlignment:    4
+stack:
+  - { id: 0, type: spill-slot, size: 8, alignment: 4 }
+machineFunctionInfo: {}
+body:             |
+
+
+  bb.0:
+    ; CHECK-LABEL: name: func
+    ; CHECK: $rax = MOV64rm $rip, 1, $noreg, @D0, $noreg :: (dereferenceable load (s64) from @D0)
+    ; CHECK-NEXT: $xmm15 = MOV64toPQIrr $rax
+    ; CHECK-NEXT: MOV64mr %stack.0, 1, $noreg, 0, $noreg, killed renamable $rax :: (store (s64) into %stack.0)
+    ; CHECK-NEXT: $rax = MOV64rm %stack.0, 1, $noreg, 0, $noreg :: (load (s64) from %stack.0)
+    ; CHECK-NEXT: MOV64mr $rip, 1, $noreg, @U0, $noreg, killed renamable $rax :: (store (s64) into @U0)
+    ; CHECK-NEXT: RET 0
+    ; FORCED-LABEL: name: func
+    ; FORCED: $rax = MOV64rm $rip, 1, $noreg, @D0, $noreg :: (dereferenceable load (s64) from @D0)
+    ; FORCED-NEXT: $xmm15 = MOV64toPQIrr $rax
+    ; FORCED-NEXT: $xmm0 = MOV64toPQIrr $rax
+    ; FORCED-NEXT: $rax = MOVPQIto64rr $xmm0
+    ; FORCED-NEXT: MOV64mr $rip, 1, $noreg, @U0, $noreg, killed renamable $rax :: (store (s64) into @U0)
+    ; FORCED-NEXT: RET 0
+    $rax = MOV64rm $rip, 1, $noreg, @D0, $noreg :: (dereferenceable load (s64) from @D0)
+    $xmm15 = MOV64toPQIrr $rax
+    MOV64mr %stack.0, 1, $noreg, 0, $noreg, killed renamable $rax :: (store (s64) into %stack.0)
+    ; reload
+    $rax = MOV64rm %stack.0, 1, $noreg, 0, $noreg :: (load (s64) from %stack.0)
+    MOV64mr $rip, 1, $noreg, @U0, $noreg, killed renamable $rax :: (store (s64) into @U0)
+    RET 0
+...
diff --git a/llvm/test/CodeGen/X86/spill2reg_disable_when_noimplicitfloat.mir b/llvm/test/CodeGen/X86/spill2reg_disable_when_noimplicitfloat.mir
new file mode 100644
index 00000000000000..0b14dc79225359
--- /dev/null
+++ b/llvm/test/CodeGen/X86/spill2reg_disable_when_noimplicitfloat.mir
@@ -0,0 +1,37 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc %s -o - -mtriple=x86_64-unknown-linux -enable-spill2reg -mattr=+sse4.1 --run-pass=spill2reg -simplify-mir -spill2reg-mem-instrs=0 -spill2reg-vec-instrs=99999 | FileCheck %s
+
+# Check that Spill2reg is disabled if the NoImplicitFloat attribute is set.
+
+--- |
+  @D0 = dso_local local_unnamed_addr global i32 0, align 4
+  @U0 = dso_local local_unnamed_addr global i32 0, align 4
+  define void @func() #0 { ret void }
+
+  attributes #0 = { noimplicitfloat }
+...
+---
+name: func
+alignment:       16
+tracksRegLiveness: true
+tracksDebugUserValues: true
+frameInfo:
+  maxAlignment:    4
+stack:
+  - { id: 0, type: spill-slot, size: 4, alignment: 4 }
+machineFunctionInfo: {}
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: func
+    ; CHECK: $eax = MOV32rm $rip, 1, $noreg, @D0, $noreg :: (dereferenceable load (s32) from @D0)
+    ; CHECK-NEXT: MOV32mr %stack.0, 1, $noreg, 0, $noreg, killed renamable $eax :: (store (s32) into %stack.0)
+    ; CHECK-NEXT: $eax = MOV32rm %stack.0, 1, $noreg, 0, $noreg :: (load (s32) from %stack.0)
+    ; CHECK-NEXT: MOV32mr $rip, 1, $noreg, @U0, $noreg, killed renamable $eax :: (store (s32) into @U0)
+    ; CHECK-NEXT: RET 0
+    $eax = MOV32rm $rip, 1, $noreg, @D0, $noreg :: (dereferenceable load (s32) from @D0)
+    MOV32mr %stack.0, 1, $noreg, 0, $noreg, killed renamable $eax :: (store (s32) into %stack.0)
+
+    $eax = MOV32rm %stack.0, 1, $noreg, 0, $noreg :: (load (s32) from %stack.0)
+    MOV32mr $rip, 1, $noreg, @U0, $noreg, killed renamable $eax :: (store (s32) into @U0)
+    RET 0
+...
diff --git a/llvm/test/CodeGen/X86/spill2reg_mask_spills.mir b/llvm/test/CodeGen/X86/spill2reg_mask_spills.mir
new file mode 100644
index 00000000000000..bf059e914b67b7
--- /dev/null
+++ b/llvm/test/CodeGen/X86/spill2reg_mask_spills.mir
@@ -0,0 +1,33 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc %s -o - -mtriple=x86_64-unknown-linux -enable-spill2reg -mattr=+avx512f --run-pass=spill2reg -simplify-mir -spill2reg-mem-instrs=0 -spill2reg-vec-instrs=99999 | FileCheck %s
+
+# Checks that spills reading from $k mask registers are skipped by Spill2Reg.
+
+--- |
+  @D0 = dso_local local_unnamed_addr global i32 0, align 4
+  @U0 = dso_local local_unnamed_addr global i32 0, align 4
+  define void @func() { ret void }
+...
+---
+name: func
+alignment:       16
+tracksRegLiveness: true
+tracksDebugUserValues: true
+frameInfo:
+  maxAlignment:    4
+stack:
+  - { id: 0, type: spill-slot, size: 4, alignment: 4 }
+machineFunctionInfo: {}
+body:             |
+  bb.0:
+    liveins: $k1
+    ; CHECK-LABEL: name: func
+    ; CHECK: liveins: $k1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: KMOVWmk %stack.0, 1, $noreg, 0, $noreg, killed renamable $k1 :: (store (s16) into %stack.0)
+    ; CHECK-NEXT: renamable $k1 = KMOVWkm %stack.0, 1, $noreg, 0, $noreg :: (load (s16) from %stack.0)
+    ; CHECK-NEXT: RET 0
+    KMOVWmk %stack.0, 1, $noreg, 0, $noreg, killed renamable $k1 :: (store (s16) into %stack.0)
+    renamable $k1 = KMOVWkm %stack.0, 1, $noreg, 0, $noreg :: (load (s16) from %stack.0)
+    RET 0
+...
diff --git a/llvm/test/CodeGen/X86/spill2reg_simple_1_32bit.mir b/llvm/test/CodeGen/X86/spill2reg_simple_1_32bit.mir
new file mode 100644
index 00000000000000..ddcf84134d89f7
--- /dev/null
+++ b/llvm/test/CodeGen/X86/spill2reg_simple_1_32bit.mir
@@ -0,0 +1,47 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc %s -o - -mtriple=x86_64-unknown-linux -enable-spill2reg -mattr=+sse4.1 --run-pass=spill2reg -simplify-mir -spill2reg-mem-instrs=0 -spill2reg-vec-instrs=99999 | FileCheck %s
+# RUN: llc %s -o - -mtriple=x86_64-unknown-linux -enable-spill2reg -mattr=-sse4.1 --run-pass=spill2reg -simplify-mir -spill2reg-mem-instrs=0 -spill2reg-vec-instrs=99999 | FileCheck --check-prefix=NOSSE %s
+
+# Simple test with a single spill-reload pair (32-bit version):
+#   spill stack.0
+#   reload stack.0
+
+--- |
+  @D0 = dso_local local_unnamed_addr global i32 0, align 4
+  @U0 = dso_local local_unnamed_addr global i32 0, align 4
+  define void @func() { ret void }
+...
+---
+name: func
+alignment:       16
+tracksRegLiveness: true
+tracksDebugUserValues: true
+frameInfo:
+  maxAlignment:    4
+stack:
+  - { id: 0, type: spill-slot, size: 4, alignment: 4 }
+machineFunctionInfo: {}
+body:             |
+
+
+  bb.0:
+    ; spill
+    ; CHECK-LABEL: name: func
+    ; CHECK: $eax = MOV32rm $rip, 1, $noreg, @D0, $noreg :: (dereferenceable load (s32) from @D0)
+    ; CHECK-NEXT: $xmm0 = MOVDI2PDIrr $eax
+    ; CHECK-NEXT: $eax = MOVPDI2DIrr $xmm0
+    ; CHECK-NEXT: MOV32mr $rip, 1, $noreg, @U0, $noreg, killed renamable $eax :: (store (s32) into @U0)
+    ; CHECK-NEXT: RET 0
+    ; NOSSE-LABEL: name: func
+    ; NOSSE: $eax = MOV32rm $rip, 1, $noreg, @D0, $noreg :: (dereferenceable load (s32) from @D0)
+    ; NOSSE-NEXT: MOV32mr %stack.0, 1, $noreg, 0, $noreg, killed renamable $eax :: (store (s32) into %stack.0)
+    ; NOSSE-NEXT: $eax = MOV32rm %stack.0, 1, $noreg, 0, $noreg :: (load (s32) from %stack.0)
+    ; NOSSE-NEXT: MOV32mr $rip, 1, $noreg, @U0, $noreg, killed renamable $eax :: (store (s32) into @U0)
+    ; NOSSE-NEXT: RET 0
+    $eax = MOV32rm $rip, 1, $noreg, @D0, $noreg :: (dereferenceable load (s32) from @D0)
+    MOV32mr %stack.0, 1, $noreg, 0, $noreg, killed renamable $eax :: (store (s32) into %stack.0)
+    ; reload
+    $eax = MOV32rm %stack.0, 1, $noreg, 0, $noreg :: (load (s32) from %stack.0)
+    MOV32mr $rip, 1, $noreg, @U0, $noreg, killed renamable $eax :: (store (s32) into @U0)
+    RET 0
+...
diff --git a/llvm/test/CodeGen/X86/spill2reg_simple_1_64bit.mir b/llvm/test/CodeGen/X86/spill2reg_simple_1_64bit.mir
new file mode 100644
index 00000000000000..5d5baa730a5983
--- /dev/null
+++ b/llvm/test/CodeGen/X86/spill2reg_simple_1_64bit.mir
@@ -0,0 +1,47 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc %s -o - -mtriple=x86_64-unknown-linux -enable-spill2reg -mattr=+sse4.1 --run-pass=spill2reg -simplify-mir -spill2reg-mem-instrs=0 -spill2reg-vec-instrs=99999 | FileCheck %s
+# RUN: llc %s -o - -mtriple=x86_64-unknown-linux -enable-spill2reg -mattr=-sse4.1 --run-pass=spill2reg -simplify-mir -spill2reg-mem-instrs=0 -spill2reg-vec-instrs=99999 | FileCheck --check-prefix=NOSSE %s
+
+# Simple test with a single spill-reload pair (64-bit version):
+#   spill stack.0
+#   reload stack.0
+
+--- |
+  @D0 = dso_local local_unnamed_addr global i64 0, align 4
+  @U0 = dso_local local_unnamed_addr global i64 0, align 4
+  define void @func() { ret void }
+...
+---
+name: func
+alignment:       16
+tracksRegLiveness: true
+tracksDebugUserValues: true
+frameInfo:
+  maxAlignment:    4
+stack:
+  - { id: 0, type: spill-slot, size: 8, alignment: 4 }
+machineFunctionInfo: {}
+body:             |
+
+
+  bb.0:
+    ; spill
+    ; CHECK-LABEL: name: func
+    ; CHECK: $rax = MOV64rm $rip, 1, $noreg, @D0, $noreg :: (dereferenceable load (s64) from @D0)
+    ; CHECK-NEXT: $xmm0 = MOV64toPQIrr $rax
+    ; CHECK-NEXT: $rax = MOVPQIto64rr $xmm0
+    ; CHECK-NEXT: MOV64mr $rip, 1, $noreg, @U0, $noreg, killed renamable $rax :: (store (s64) into @U0)
+    ; CHECK-NEXT: RET 0
+    ; NOSSE-LABEL: name: func
+    ; NOSSE: $rax = MOV64rm $rip, 1, $noreg, @D0, $noreg :: (dereferenceable load (s64) from @D0)
+    ; NOSSE-NEXT: MOV64mr %stack.0, 1, $noreg, 0, $noreg, killed renamable $rax :: (store (s64) into %stack.0)
+    ; NOSSE-NEXT: $rax = MOV64rm %stack.0, 1, $noreg, 0, $noreg :: (load (s64) from %stack.0)
+    ; NOSSE-NEXT: MOV64mr $rip, 1, $noreg, @U0, $noreg, killed renamable $rax :: (store (s64) into @U0)
+    ; NOSSE-NEXT: RET 0
+    $rax = MOV64rm $rip, 1, $noreg, @D0, $noreg :: (dereferenceable load (s64) from @D0)
+    MOV64mr %stack.0, 1, $noreg, 0, $noreg, killed renamable $rax :: (store (s64) into %stack.0)
+    ; reload
+    $rax = MOV64rm %stack.0, 1, $noreg, 0, $noreg :: (load (s64) from %stack.0)
+    MOV64mr $rip, 1, $noreg, @U0, $noreg, killed renamable $rax :: (store (s64) into @U0)
+    RET 0
+...
diff --git a/llvm/test/CodeGen/X86/spill2reg_simple_2.mir b/llvm/test/CodeGen/X86/spill2reg_simple_2.mir
new file mode 100644
index 00000000000000..a8cfa501436d79
--- /dev/null
+++ b/llvm/test/CodeGen/X86/spill2reg_simple_2.mir
@@ -0,0 +1,52 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc %s -o - -mtriple=x86_64-unknown-linux -enable-spill2reg -mattr=+sse4.1 --run-pass=spill2reg -simplify-mir -spill2reg-mem-instrs=0 -spill2reg-vec-instrs=99999 | FileCheck %s
+
+# Simple test with two overlapping spill-reload pairs.
+#   spill stack.0
+#   spill stack.1
+#   reload stack.0
+#   reload stack.1
+
+--- |
+  @D0 = dso_local local_unnamed_addr global i32 0, align 4
+  @D1 = dso_local local_unnamed_addr global i32 0, align 4
+  @U0 = dso_local local_unnamed_addr global i32 0, align 4
+  @U1 = dso_local local_unnamed_addr global i32 0, align 4
+  define void @func() { ret void }
+...
+---
+name: func
+alignment:       16
+tracksRegLiveness: true
+tracksDebugUserValues: true
+frameInfo:
+  maxAlignment:    4
+stack:
+  - { id: 0, type: spill-slot, size: 4, alignment: 4 }
+  - { id: 1, type: spill-slot, size: 4, alignment: 4 }
+machineFunctionInfo: {}
+body:             |
+
+  bb.0:
+    ; CHECK-LABEL: name: func
+    ; CHECK: $eax = MOV32rm $rip, 1, $noreg, @D0, $noreg :: (dereferenceable load (s32) from @D0)
+    ; CHECK-NEXT: $xmm0 = MOVDI2PDIrr $eax
+    ; CHECK-NEXT: $eax = MOV32rm $rip, 1, $noreg, @D1, $noreg :: (dereferenceable load (s32) from @D1)
+    ; CHECK-NEXT: $xmm0 = MOVDI2PDIrr $eax
+    ; CHECK-NEXT: $eax = MOVPDI2DIrr $xmm0
+    ; CHECK-NEXT: MOV32mr $rip, 1, $noreg, @U0, $noreg, killed renamable $eax :: (store (s32) into @U0)
+    ; CHECK-NEXT: $eax = MOVPDI2DIrr $xmm0
+    ; CHECK-NEXT: MOV32mr $rip, 1, $noreg, @U1, $noreg, killed renamable $eax :: (store (s32) into @U1)
+    ; CHECK-NEXT: RET 0
+    $eax = MOV32rm $rip, 1, $noreg, @D0, $noreg :: (dereferenceable load (s32) from @D0)
+    MOV32mr %stack.0, 1, $noreg, 0, $noreg, killed renamable $eax :: (store (s32) into %stack.0)
+    $eax = MOV32rm $rip, 1, $noreg, @D1, $noreg :: (dereferenceable load (s32) from @D1)
+    MOV32mr %stack.1, 1, $noreg, 0, $noreg, killed renamable $eax :: (store (s32) into %stack.1)
+
+    $eax = MOV32rm %stack.0, 1, $noreg, 0, $noreg :: (load (s32) from %stack.0)
+    MOV32mr $rip, 1, $noreg, @U0, $noreg, killed renamable $eax :: (store (s32) into @U0)
+    $eax = MOV32rm %stack.1, 1, $noreg, 0, $noreg :: (load (s32) from %stack.1)
+    MOV32mr $rip, 1, $noreg, @U1, $noreg, killed renamable $eax :: (store (s32) into @U1)
+    RET 0
+
+...
diff --git a/llvm/test/CodeGen/X86/spill2reg_simple_3.mir b/llvm/test/CodeGen/X86/spill2reg_simple_3.mir
new file mode 100644
index 00000000000000..5a87fb562bb2c0
--- /dev/null
+++ b/llvm/test/CodeGen/X86/spill2reg_simple_3.mir
@@ -0,0 +1,203 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc %s -o - -mtriple=x86_64-unknown-linux -enable-spill2reg -mattr=+sse4.1 --run-pass=spill2reg -simplify-mir -spill2reg-mem-instrs=0 -spill2reg-vec-instrs=99999 | FileCheck %s --check-prefix=MEM0
+# RUN: llc %s -o - -mtriple=x86_64-unknown-linux -enable-spill2reg -mattr=+sse4.1 --run-pass=spill2reg -simplify-mir -spill2reg-mem-instrs=50 -spill2reg-vec-instrs=99999 | FileCheck %s --check-prefix=MEM50
+# RUN: llc %s -o - -mtriple=x86_64-unknown-linux -enable-spill2reg -mattr=+sse4.1 --run-pass=spill2reg -simplify-mir -spill2reg-mem-instrs=100 -spill2reg-vec-instrs=99999 | FileCheck %s --check-prefix=MEM100
+
+
+# Simple test with several overlapping spill-reload pairs.
+# This tests the -spill2reg-mem-instrs flag.
+
+#   spill stack.0
+#   spill stack.1
+#   spill stack.2
+#   spill stack.3
+#   spill stack.4
+#   spill stack.5
+#   spill stack.6
+#   spill stack.7
+#   reload stack.0
+#   reload stack.1
+#   reload stack.2
+#   reload stack.3
+#   reload stack.4
+#   reload stack.5
+#   reload stack.6
+#   reload stack.7
+
+--- |
+  @D0 = dso_local local_unnamed_addr global i32 0, align 4
+  @D1 = dso_local local_unnamed_addr global i32 0, align 4
+  @D2 = dso_local local_unnamed_addr global i32 0, align 4
+  @D3 = dso_local local_unnamed_addr global i32 0, align 4
+  @D4 = dso_local local_unnamed_addr global i32 0, align 4
+  @D5 = dso_local local_unnamed_addr global i32 0, align 4
+  @D6 = dso_local local_unnamed_addr global i32 0, align 4
+  @D7 = dso_local local_unnamed_addr global i32 0, align 4
+  @U0 = dso_local local_unnamed_addr global i32 0, align 4
+  @U1 = dso_local local_unnamed_addr global i32 0, align 4
+  @U2 = dso_local local_unnamed_addr global i32 0, align 4
+  @U3 = dso_local local_unnamed_addr global i32 0, align 4
+  @U4 = dso_local local_unnamed_addr global i32 0, align 4
+  @U5 = dso_local local_unnamed_addr global i32 0, align 4
+  @U6 = dso_local local_unnamed_addr global i32 0, align 4
+  @U7 = dso_local local_unnamed_addr global i32 0, align 4
+  define void @func() { ret void }
+...
+---
+name: func
+alignment:       16
+tracksRegLiveness: true
+tracksDebugUserValues: true
+frameInfo:
+  maxAlignment:    4
+stack:
+  - { id: 0, type: spill-slot, size: 4, alignment: 4 }
+  - { id: 1, type: spill-slot, size: 4, alignment: 4 }
+  - { id: 2, type: spill-slot, size: 4, alignment: 4 }
+  - { id: 3, type: spill-slot, size: 4, alignment: 4 }
+  - { id: 4, type: spill-slot, size: 4, alignment: 4 }
+  - { id: 5, type: spill-slot, size: 4, alignment: 4 }
+  - { id: 6, type: spill-slot, size: 4, alignment: 4 }
+  - { id: 7, type: spill-slot, size: 4, alignment: 4 }
+machineFunctionInfo: {}
+body:             |
+
+  bb.0:
+    ; MEM0-LABEL: name: func
+    ; MEM0: $eax = MOV32rm $rip, 1, $noreg, @D0, $noreg :: (dereferenceable load (s32) from @D0)
+    ; MEM0-NEXT: $xmm0 = MOVDI2PDIrr $eax
+    ; MEM0-NEXT: $eax = MOV32rm $rip, 1, $noreg, @D1, $noreg :: (dereferenceable load (s32) from @D1)
+    ; MEM0-NEXT: $xmm0 = MOVDI2PDIrr $eax
+    ; MEM0-NEXT: $eax = MOV32rm $rip, 1, $noreg, @D2, $noreg :: (dereferenceable load (s32) from @D2)
+    ; MEM0-NEXT: $xmm0 = MOVDI2PDIrr $eax
+    ; MEM0-NEXT: $eax = MOV32rm $rip, 1, $noreg, @D3, $noreg :: (dereferenceable load (s32) from @D3)
+    ; MEM0-NEXT: $xmm0 = MOVDI2PDIrr $eax
+    ; MEM0-NEXT: $eax = MOV32rm $rip, 1, $noreg, @D4, $noreg :: (dereferenceable load (s32) from @D4)
+    ; MEM0-NEXT: $xmm0 = MOVDI2PDIrr $eax
+    ; MEM0-NEXT: $eax = MOV32rm $rip, 1, $noreg, @D5, $noreg :: (dereferenceable load (s32) from @D5)
+    ; MEM0-NEXT: $xmm0 = MOVDI2PDIrr $eax
+    ; MEM0-NEXT: $eax = MOV32rm $rip, 1, $noreg, @D6, $noreg :: (dereferenceable load (s32) from @D6)
+    ; MEM0-NEXT: $xmm0 = MOVDI2PDIrr $eax
+    ; MEM0-NEXT: $eax = MOV32rm $rip, 1, $noreg, @D7, $noreg :: (dereferenceable load (s32) from @D7)
+    ; MEM0-NEXT: $xmm0 = MOVDI2PDIrr $eax
+    ; MEM0-NEXT: $eax = MOVPDI2DIrr $xmm0
+    ; MEM0-NEXT: MOV32mr $rip, 1, $noreg, @U0, $noreg, killed renamable $eax :: (store (s32) into @U0)
+    ; MEM0-NEXT: $eax = MOVPDI2DIrr $xmm0
+    ; MEM0-NEXT: MOV32mr $rip, 1, $noreg, @U1, $noreg, killed renamable $eax :: (store (s32) into @U1)
+    ; MEM0-NEXT: $eax = MOVPDI2DIrr $xmm0
+    ; MEM0-NEXT: MOV32mr $rip, 1, $noreg, @U2, $noreg, killed renamable $eax :: (store (s32) into @U2)
+    ; MEM0-NEXT: $eax = MOVPDI2DIrr $xmm0
+    ; MEM0-NEXT: MOV32mr $rip, 1, $noreg, @U3, $noreg, killed renamable $eax :: (store (s32) into @U3)
+    ; MEM0-NEXT: $eax = MOVPDI2DIrr $xmm0
+    ; MEM0-NEXT: MOV32mr $rip, 1, $noreg, @U4, $noreg, killed renamable $eax :: (store (s32) into @U4)
+    ; MEM0-NEXT: $eax = MOVPDI2DIrr $xmm0
+    ; MEM0-NEXT: MOV32mr $rip, 1, $noreg, @U5, $noreg, killed renamable $eax :: (store (s32) into @U5)
+    ; MEM0-NEXT: $eax = MOVPDI2DIrr $xmm0
+    ; MEM0-NEXT: MOV32mr $rip, 1, $noreg, @U6, $noreg, killed renamable $eax :: (store (s32) into @U6)
+    ; MEM0-NEXT: $eax = MOVPDI2DIrr $xmm0
+    ; MEM0-NEXT: MOV32mr $rip, 1, $noreg, @U7, $noreg, killed renamable $eax :: (store (s32) into @U7)
+    ; MEM0-NEXT: RET 0
+    ; MEM50-LABEL: name: func
+    ; MEM50: $eax = MOV32rm $rip, 1, $noreg, @D0, $noreg :: (dereferenceable load (s32) from @D0)
+    ; MEM50-NEXT: MOV32mr %stack.0, 1, $noreg, 0, $noreg, killed renamable $eax :: (store (s32) into %stack.0)
+    ; MEM50-NEXT: $eax = MOV32rm $rip, 1, $noreg, @D1, $noreg :: (dereferenceable load (s32) from @D1)
+    ; MEM50-NEXT: $xmm0 = MOVDI2PDIrr $eax
+    ; MEM50-NEXT: $eax = MOV32rm $rip, 1, $noreg, @D2, $noreg :: (dereferenceable load (s32) from @D2)
+    ; MEM50-NEXT: $xmm0 = MOVDI2PDIrr $eax
+    ; MEM50-NEXT: $eax = MOV32rm $rip, 1, $noreg, @D3, $noreg :: (dereferenceable load (s32) from @D3)
+    ; MEM50-NEXT: $xmm0 = MOVDI2PDIrr $eax
+    ; MEM50-NEXT: $eax = MOV32rm $rip, 1, $noreg, @D4, $noreg :: (dereferenceable load (s32) from @D4)
+    ; MEM50-NEXT: $xmm0 = MOVDI2PDIrr $eax
+    ; MEM50-NEXT: $eax = MOV32rm $rip, 1, $noreg, @D5, $noreg :: (dereferenceable load (s32) from @D5)
+    ; MEM50-NEXT: $xmm0 = MOVDI2PDIrr $eax
+    ; MEM50-NEXT: $eax = MOV32rm $rip, 1, $noreg, @D6, $noreg :: (dereferenceable load (s32) from @D6)
+    ; MEM50-NEXT: $xmm0 = MOVDI2PDIrr $eax
+    ; MEM50-NEXT: $eax = MOV32rm $rip, 1, $noreg, @D7, $noreg :: (dereferenceable load (s32) from @D7)
+    ; MEM50-NEXT: MOV32mr %stack.7, 1, $noreg, 0, $noreg, killed renamable $eax :: (store (s32) into %stack.7)
+    ; MEM50-NEXT: $eax = MOV32rm %stack.0, 1, $noreg, 0, $noreg :: (load (s32) from %stack.0)
+    ; MEM50-NEXT: MOV32mr $rip, 1, $noreg, @U0, $noreg, killed renamable $eax :: (store (s32) into @U0)
+    ; MEM50-NEXT: $eax = MOVPDI2DIrr $xmm0
+    ; MEM50-NEXT: MOV32mr $rip, 1, $noreg, @U1, $noreg, killed renamable $eax :: (store (s32) into @U1)
+    ; MEM50-NEXT: $eax = MOVPDI2DIrr $xmm0
+    ; MEM50-NEXT: MOV32mr $rip, 1, $noreg, @U2, $noreg, killed renamable $eax :: (store (s32) into @U2)
+    ; MEM50-NEXT: $eax = MOVPDI2DIrr $xmm0
+    ; MEM50-NEXT: MOV32mr $rip, 1, $noreg, @U3, $noreg, killed renamable $eax :: (store (s32) into @U3)
+    ; MEM50-NEXT: $eax = MOVPDI2DIrr $xmm0
+    ; MEM50-NEXT: MOV32mr $rip, 1, $noreg, @U4, $noreg, killed renamable $eax :: (store (s32) into @U4)
+    ; MEM50-NEXT: $eax = MOVPDI2DIrr $xmm0
+    ; MEM50-NEXT: MOV32mr $rip, 1, $noreg, @U5, $noreg, killed renamable $eax :: (store (s32) into @U5)
+    ; MEM50-NEXT: $eax = MOVPDI2DIrr $xmm0
+    ; MEM50-NEXT: MOV32mr $rip, 1, $noreg, @U6, $noreg, killed renamable $eax :: (store (s32) into @U6)
+    ; MEM50-NEXT: $eax = MOV32rm %stack.7, 1, $noreg, 0, $noreg :: (load (s32) from %stack.7)
+    ; MEM50-NEXT: MOV32mr $rip, 1, $noreg, @U7, $noreg, killed renamable $eax :: (store (s32) into @U7)
+    ; MEM50-NEXT: RET 0
+    ; MEM100-LABEL: name: func
+    ; MEM100: $eax = MOV32rm $rip, 1, $noreg, @D0, $noreg :: (dereferenceable load (s32) from @D0)
+    ; MEM100-NEXT: MOV32mr %stack.0, 1, $noreg, 0, $noreg, killed renamable $eax :: (store (s32) into %stack.0)
+    ; MEM100-NEXT: $eax = MOV32rm $rip, 1, $noreg, @D1, $noreg :: (dereferenceable load (s32) from @D1)
+    ; MEM100-NEXT: MOV32mr %stack.1, 1, $noreg, 0, $noreg, killed renamable $eax :: (store (s32) into %stack.1)
+    ; MEM100-NEXT: $eax = MOV32rm $rip, 1, $noreg, @D2, $noreg :: (dereferenceable load (s32) from @D2)
+    ; MEM100-NEXT: $xmm0 = MOVDI2PDIrr $eax
+    ; MEM100-NEXT: $eax = MOV32rm $rip, 1, $noreg, @D3, $noreg :: (dereferenceable load (s32) from @D3)
+    ; MEM100-NEXT: MOV32mr %stack.3, 1, $noreg, 0, $noreg, killed renamable $eax :: (store (s32) into %stack.3)
+    ; MEM100-NEXT: $eax = MOV32rm $rip, 1, $noreg, @D4, $noreg :: (dereferenceable load (s32) from @D4)
+    ; MEM100-NEXT: $xmm0 = MOVDI2PDIrr $eax
+    ; MEM100-NEXT: $eax = MOV32rm $rip, 1, $noreg, @D5, $noreg :: (dereferenceable load (s32) from @D5)
+    ; MEM100-NEXT: MOV32mr %stack.5, 1, $noreg, 0, $noreg, killed renamable $eax :: (store (s32) into %stack.5)
+    ; MEM100-NEXT: $eax = MOV32rm $rip, 1, $noreg, @D6, $noreg :: (dereferenceable load (s32) from @D6)
+    ; MEM100-NEXT: MOV32mr %stack.6, 1, $noreg, 0, $noreg, killed renamable $eax :: (store (s32) into %stack.6)
+    ; MEM100-NEXT: $eax = MOV32rm $rip, 1, $noreg, @D7, $noreg :: (dereferenceable load (s32) from @D7)
+    ; MEM100-NEXT: MOV32mr %stack.7, 1, $noreg, 0, $noreg, killed renamable $eax :: (store (s32) into %stack.7)
+    ; MEM100-NEXT: $eax = MOV32rm %stack.0, 1, $noreg, 0, $noreg :: (load (s32) from %stack.0)
+    ; MEM100-NEXT: MOV32mr $rip, 1, $noreg, @U0, $noreg, killed renamable $eax :: (store (s32) into @U0)
+    ; MEM100-NEXT: $eax = MOV32rm %stack.1, 1, $noreg, 0, $noreg :: (load (s32) from %stack.1)
+    ; MEM100-NEXT: MOV32mr $rip, 1, $noreg, @U1, $noreg, killed renamable $eax :: (store (s32) into @U1)
+    ; MEM100-NEXT: $eax = MOVPDI2DIrr $xmm0
+    ; MEM100-NEXT: MOV32mr $rip, 1, $noreg, @U2, $noreg, killed renamable $eax :: (store (s32) into @U2)
+    ; MEM100-NEXT: $eax = MOV32rm %stack.3, 1, $noreg, 0, $noreg :: (load (s32) from %stack.3)
+    ; MEM100-NEXT: MOV32mr $rip, 1, $noreg, @U3, $noreg, killed renamable $eax :: (store (s32) into @U3)
+    ; MEM100-NEXT: $eax = MOVPDI2DIrr $xmm0
+    ; MEM100-NEXT: MOV32mr $rip, 1, $noreg, @U4, $noreg, killed renamable $eax :: (store (s32) into @U4)
+    ; MEM100-NEXT: $eax = MOV32rm %stack.5, 1, $noreg, 0, $noreg :: (load (s32) from %stack.5)
+    ; MEM100-NEXT: MOV32mr $rip, 1, $noreg, @U5, $noreg, killed renamable $eax :: (store (s32) into @U5)
+    ; MEM100-NEXT: $eax = MOV32rm %stack.6, 1, $noreg, 0, $noreg :: (load (s32) from %stack.6)
+    ; MEM100-NEXT: MOV32mr $rip, 1, $noreg, @U6, $noreg, killed renamable $eax :: (store (s32) into @U6)
+    ; MEM100-NEXT: $eax = MOV32rm %stack.7, 1, $noreg, 0, $noreg :: (load (s32) from %stack.7)
+    ; MEM100-NEXT: MOV32mr $rip, 1, $noreg, @U7, $noreg, killed renamable $eax :: (store (s32) into @U7)
+    ; MEM100-NEXT: RET 0
+    $eax = MOV32rm $rip, 1, $noreg, @D0, $noreg :: (dereferenceable load (s32) from @D0)
+    MOV32mr %stack.0, 1, $noreg, 0, $noreg, killed renamable $eax :: (store (s32) into %stack.0)
+    $eax = MOV32rm $rip, 1, $noreg, @D1, $noreg :: (dereferenceable load (s32) from @D1)
+    MOV32mr %stack.1, 1, $noreg, 0, $noreg, killed renamable $eax :: (store (s32) into %stack.1)
+    $eax = MOV32rm $rip, 1, $noreg, @D2, $noreg :: (dereferenceable load (s32) from @D2)
+    MOV32mr %stack.2, 1, $noreg, 0, $noreg, killed renamable $eax :: (store (s32) into %stack.2)
+    $eax = MOV32rm $rip, 1, $noreg, @D3, $noreg :: (dereferenceable load (s32) from @D3)
+    MOV32mr %stack.3, 1, $noreg, 0, $noreg, killed renamable $eax :: (store (s32) into %stack.3)
+    $eax = MOV32rm $rip, 1, $noreg, @D4, $noreg :: (dereferenceable load (s32) from @D4)
+    MOV32mr %stack.4, 1, $noreg, 0, $noreg, killed renamable $eax :: (store (s32) into %stack.4)
+    $eax = MOV32rm $rip, 1, $noreg, @D5, $noreg :: (dereferenceable load (s32) from @D5)
+    MOV32mr %stack.5, 1, $noreg, 0, $noreg, killed renamable $eax :: (store (s32) into %stack.5)
+    $eax = MOV32rm $rip, 1, $noreg, @D6, $noreg :: (dereferenceable load (s32) from @D6)
+    MOV32mr %stack.6, 1, $noreg, 0, $noreg, killed renamable $eax :: (store (s32) into %stack.6)
+    $eax = MOV32rm $rip, 1, $noreg, @D7, $noreg :: (dereferenceable load (s32) from @D7)
+    MOV32mr %stack.7, 1, $noreg, 0, $noreg, killed renamable $eax :: (store (s32) into %stack.7)
+
+    $eax = MOV32rm %stack.0, 1, $noreg, 0, $noreg :: (load (s32) from %stack.0)
+    MOV32mr $rip, 1, $noreg, @U0, $noreg, killed renamable $eax :: (store (s32) into @U0)
+    $eax = MOV32rm %stack.1, 1, $noreg, 0, $noreg :: (load (s32) from %stack.1)
+    MOV32mr $rip, 1, $noreg, @U1, $noreg, killed renamable $eax :: (store (s32) into @U1)
+    $eax = MOV32rm %stack.2, 1, $noreg, 0, $noreg :: (load (s32) from %stack.2)
+    MOV32mr $rip, 1, $noreg, @U2, $noreg, killed renamable $eax :: (store (s32) into @U2)
+    $eax = MOV32rm %stack.3, 1, $noreg, 0, $noreg :: (load (s32) from %stack.3)
+    MOV32mr $rip, 1, $noreg, @U3, $noreg, killed renamable $eax :: (store (s32) into @U3)
+    $eax = MOV32rm %stack.4, 1, $noreg, 0, $noreg :: (load (s32) from %stack.4)
+    MOV32mr $rip, 1, $noreg, @U4, $noreg, killed renamable $eax :: (store (s32) into @U4)
+    $eax = MOV32rm %stack.5, 1, $noreg, 0, $noreg :: (load (s32) from %stack.5)
+    MOV32mr $rip, 1, $noreg, @U5, $noreg, killed renamable $eax :: (store (s32) into @U5)
+    $eax = MOV32rm %stack.6, 1, $noreg, 0, $noreg :: (load (s32) from %stack.6)
+    MOV32mr $rip, 1, $noreg, @U6, $noreg, killed renamable $eax :: (store (s32) into @U6)
+    $eax = MOV32rm %stack.7, 1, $noreg, 0, $noreg :: (load (s32) from %stack.7)
+    MOV32mr $rip, 1, $noreg, @U7, $noreg, killed renamable $eax :: (store (s32) into @U7)
+    RET 0
+
+...
diff --git a/llvm/test/Transforms/SLPVectorizer/stores_init.ll b/llvm/test/Transforms/SLPVectorizer/stores_init.ll
new file mode 100644
index 00000000000000..7e7813c13311d3
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/stores_init.ll
@@ -0,0 +1,10 @@
+define void @stores_init(ptr %ptr) {
+  %gep0 = getelementptr i8, ptr %ptr, i64 0
+  %gep1 = getelementptr i8, ptr %ptr, i64 1
+  %gep2 = getelementptr i8, ptr %ptr, i64 2
+  %gep3 = getelementptr i8, ptr %ptr, i64 3
+  store i8 0, ptr %gep0
+  store i8 1, ptr %gep1
+  store i16 2, ptr %gep2
+  ret void
+}

>From 38569d9afb31b6746ed43c49e2a6ecd7d51c1252 Mon Sep 17 00:00:00 2001
From: Vasileios Porpodas <vporpodas at google.com>
Date: Fri, 20 Dec 2024 02:17:38 -0800
Subject: [PATCH 6/9] [Spill2Reg] Add live register tracking

This patch implements tracking of live registers. This is used to look for free
vector registers. It works by walking up the CFG from the reloads all the way
to the spills, accumulating the register units being used.
This implementation caches the live register units used by each MBB to reduce
compilation time.
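
As a rough sketch of this bottom-up walk (illustration only, not part of the
patch: it models blocks and register units with plain std containers instead
of MachineBasicBlock/LiveRegUnits, and it conservatively ORs in a spill
block's whole cached unit set instead of stopping at the spill instruction):

  #include <bitset>
  #include <unordered_set>
  #include <vector>

  // Toy model: each block caches the register units it uses (the per-MBB
  // cache described above) and knows whether it contains a spill.
  struct Block {
    std::vector<Block *> Preds;
    std::bitset<64> UsedUnits; // register units used anywhere in this block
    bool HasSpill = false;
  };

  // Pre-order DFS over predecessors: accumulate used units bottom-up from a
  // reload block, stopping once a spill block is reached.
  void accumulate(Block *B, std::unordered_set<Block *> &Visited,
                  std::bitset<64> &LRU) {
    if (!Visited.insert(B).second)
      return;               // already visited: the CFG may have cycles
    LRU |= B->UsedUnits;    // reuse the cached per-block units
    if (B->HasSpill)
      return;               // do not walk above the spill
    for (Block *Pred : B->Preds)
      accumulate(Pred, Visited, LRU);
  }

A unit whose bit is still clear after the walk is free across the whole
spill-to-reload range and can hold the spilled value.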

Note: Live register tracking relies on MBB liveins/outs being maintained
correctly, which is implemented in a follow-up patch. So this patch still does
not generate correct code except in some simple cases.

Original review: https://reviews.llvm.org/D118303
---
 llvm/lib/CodeGen/Spill2Reg.cpp                | 135 ++++++++++++-
 .../CodeGen/X86/spill2reg_end_to_end_16bit.ll | 177 +++++++++++++++++
 .../CodeGen/X86/spill2reg_end_to_end_32bit.ll | 178 ++++++++++++++++++
 .../CodeGen/X86/spill2reg_end_to_end_64bit.ll | 177 +++++++++++++++++
 .../CodeGen/X86/spill2reg_end_to_end_8bit.ll  | 177 +++++++++++++++++
 .../X86/spill2reg_liveregs_all_live.mir       |  62 ++++++
 .../CodeGen/X86/spill2reg_liveregs_call.mir   |  46 +++++
 .../X86/spill2reg_liveregs_defined_in_bb.mir  |  57 ++++++
 .../CodeGen/X86/spill2reg_liveregs_livein.mir |  63 +++++++
 ...g_liveregs_reload_mbb_and_intermediate.mir |  75 ++++++++
 llvm/test/CodeGen/X86/spill2reg_simple_2.mir  |   4 +-
 llvm/test/CodeGen/X86/spill2reg_simple_3.mir  |  52 ++---
 12 files changed, 1170 insertions(+), 33 deletions(-)
 create mode 100644 llvm/test/CodeGen/X86/spill2reg_end_to_end_16bit.ll
 create mode 100644 llvm/test/CodeGen/X86/spill2reg_end_to_end_32bit.ll
 create mode 100644 llvm/test/CodeGen/X86/spill2reg_end_to_end_64bit.ll
 create mode 100644 llvm/test/CodeGen/X86/spill2reg_end_to_end_8bit.ll
 create mode 100644 llvm/test/CodeGen/X86/spill2reg_liveregs_all_live.mir
 create mode 100644 llvm/test/CodeGen/X86/spill2reg_liveregs_call.mir
 create mode 100644 llvm/test/CodeGen/X86/spill2reg_liveregs_defined_in_bb.mir
 create mode 100644 llvm/test/CodeGen/X86/spill2reg_liveregs_livein.mir
 create mode 100644 llvm/test/CodeGen/X86/spill2reg_liveregs_reload_mbb_and_intermediate.mir

diff --git a/llvm/lib/CodeGen/Spill2Reg.cpp b/llvm/lib/CodeGen/Spill2Reg.cpp
index 7c6e0a5dd64d33..29d9f9509dd7f7 100644
--- a/llvm/lib/CodeGen/Spill2Reg.cpp
+++ b/llvm/lib/CodeGen/Spill2Reg.cpp
@@ -99,6 +99,9 @@ class Spill2Reg : public MachineFunctionPass {
   /// Helper for generateCode(). It replaces stack spills or reloads with movs
   /// to \p VectorReg.
   void replaceStackWithReg(StackSlotDataEntry &Entry, Register VectorReg);
+  /// Updates the live-ins of MBBs after we emit the new spill2reg instructions
+  /// and the vector registers become live from the spills to the reloads.
+  void updateLiveIns(StackSlotDataEntry &Entry, MCRegister VectorReg);
   /// Updates \p LRU with the liveness of physical registers around the spills
   /// and reloads in \p Entry.
   void calculateLiveRegs(StackSlotDataEntry &Entry, LiveRegUnits &LRU);
@@ -111,6 +114,9 @@ class Spill2Reg : public MachineFunctionPass {
 
   /// Map from a stack slot to the corresponding spills and reloads.
   DenseMap<int, StackSlotDataEntry> StackSlotData;
+  /// The registers used by each block (from LiveRegUnits). This is needed for
+  /// finding free physical registers in generateCode().
+  DenseMap<const MachineBasicBlock *, LiveRegUnits> LRUs;
 
   MachineFunction *MF = nullptr;
   MachineRegisterInfo *MRI = nullptr;
@@ -169,7 +175,16 @@ void Spill2Reg::collectSpillsAndReloads() {
   // If any spill/reload for a stack slot is found not to be eligible for
   // spill-to-reg, then that stack slot is disabled.
   for (MachineBasicBlock &MBB : *MF) {
-    for (MachineInstr &MI : MBB) {
+    // Initialize AccumMBBLRU for keeping track of physical registers used
+    // across the whole MBB.
+    LiveRegUnits AccumMBBLRU(*TRI);
+    AccumMBBLRU.addLiveOuts(MBB);
+
+    // Collect spills/reloads
+    for (MachineInstr &MI : llvm::reverse(MBB)) {
+      // Update the LRU state as we move upwards.
+      AccumMBBLRU.accumulate(MI);
+
       int StackSlot;
       if (const MachineOperand *MO = TII->isStoreToStackSlotMO(MI, StackSlot)) {
         MachineInstr *Spill = &MI;
@@ -203,6 +218,8 @@ void Spill2Reg::collectSpillsAndReloads() {
           }
       }
     }
+
+    LRUs.insert(std::make_pair(&MBB, AccumMBBLRU));
   }
 }
 
@@ -228,6 +245,26 @@ Spill2Reg::tryGetFreePhysicalReg(const TargetRegisterClass *RegClass,
   return std::nullopt;
 }
 
+/// Performs a bottom-up depth-first traversal from \p MBB towards its
+/// predecessor blocks. \p Visited marks the visited blocks. \p Fn is the
+/// callback function called in pre-order. If \p Fn returns true we stop the
+/// traversal.
+// TODO: Use df_iterator
+static void DFS(MachineBasicBlock *MBB, DenseSet<MachineBasicBlock *> &Visited,
+                std::function<bool(MachineBasicBlock *)> Fn) {
+  // Skip visited to avoid infinite loops.
+  if (Visited.count(MBB))
+    return;
+  Visited.insert(MBB);
+
+  // Preorder.
+  if (Fn(MBB))
+    return;
+
+  // Depth-first across predecessors.
+  for (MachineBasicBlock *PredMBB : MBB->predecessors())
+    DFS(PredMBB, Visited, Fn);
+}
 // Replace stack-based spills/reloads with register-based ones.
 void Spill2Reg::replaceStackWithReg(StackSlotDataEntry &Entry,
                                     Register VectorReg) {
@@ -236,10 +273,13 @@ void Spill2Reg::replaceStackWithReg(StackSlotDataEntry &Entry,
     assert(SpillData.MO->isReg() && "Expected register MO");
     Register OldReg = SpillData.MO->getReg();
 
-    MachineInstr *SpillToVector = TII->spill2RegInsertToVectorReg(
+    TII->spill2RegInsertToVectorReg(
         VectorReg, OldReg, SpillData.SpillBits, StackSpill->getParent(),
         /*InsertBeforeIt=*/StackSpill->getIterator(), TRI);
 
+    // Mark VectorReg as live in the instr's BB.
+    LRUs[StackSpill->getParent()].addReg(VectorReg);
+
     // Spill to stack is no longer needed.
     StackSpill->eraseFromParent();
     assert(OldReg.isPhysical() && "Otherwise we need to removeInterval()");
@@ -250,10 +290,13 @@ void Spill2Reg::replaceStackWithReg(StackSlotDataEntry &Entry,
     assert(ReloadData.MO->isReg() && "Expected Reg MO");
     Register OldReg = ReloadData.MO->getReg();
 
-    MachineInstr *ReloadFromReg = TII->spill2RegExtractFromVectorReg(
+    TII->spill2RegExtractFromVectorReg(
         OldReg, VectorReg, ReloadData.SpillBits, StackReload->getParent(),
         /*InsertBeforeIt=*/StackReload->getIterator(), TRI);
 
+    // Mark VectorReg as live in the instr's BB.
+    LRUs[StackReload->getParent()].addReg(VectorReg);
+
     // Reload from stack is no longer needed.
     StackReload->eraseFromParent();
     assert(OldReg.isPhysical() && "Otherwise we need to removeInterval()");
@@ -262,7 +305,86 @@ void Spill2Reg::replaceStackWithReg(StackSlotDataEntry &Entry,
 
 void Spill2Reg::calculateLiveRegs(StackSlotDataEntry &Entry,
                                   LiveRegUnits &LRU) {
-  // TODO: Unimplemented
+  // Collect the parent MBBs of Spills for fast lookup.
+  DenseSet<MachineBasicBlock *> SpillMBBs(Entry.Spills.size());
+  DenseSet<MachineInstr *> Spills(Entry.Spills.size());
+  for (const auto &Data : Entry.Spills) {
+    SpillMBBs.insert(Data.MI->getParent());
+    Spills.insert(Data.MI);
+  }
+
+  /// Walks up the instructions in \p Reload's block, stopping at a spill if
+  /// found. \Returns true if a spill was found, false otherwise.
+  auto AccumulateLRUUntilSpillFn = [&Spills, &SpillMBBs](MachineInstr *Reload,
+                                                         LiveRegUnits &LRU) {
+    MachineBasicBlock *MBB = Reload->getParent();
+    bool IsSpillBlock = SpillMBBs.count(MBB);
+    // Add all MBB's live-outs.
+    LRU.addLiveOuts(*MBB);
+    // Then walk up the BB, starting from Reload, looking for any spill.
+    for (MachineInstr *CurrMI = Reload; CurrMI != nullptr;
+         CurrMI = CurrMI->getPrevNode()) {
+      LRU.accumulate(*CurrMI);
+      // If a spill is found then return true to end the recursion.
+      if (IsSpillBlock && Spills.count(CurrMI))
+        return true;
+    }
+    return false;
+  };
+
+  // Helper for the traversal. It accumulates all register units used in
+  // \p MBB. It returns true once a spill is found, to stop the recursion.
+  auto AccumulateLRUFn = [&SpillMBBs, &LRU, AccumulateLRUUntilSpillFn,
+                          this](MachineBasicBlock *MBB) {
+    if (SpillMBBs.count(MBB)) {
+      // If this is a spill block, then walk bottom-up until the spill.
+      assert(!MBB->empty() && "How can it be a spill block and empty?");
+      bool FoundSpill = AccumulateLRUUntilSpillFn(&*MBB->rbegin(), LRU);
+      assert(FoundSpill && "Spill block but we couldn't find spill!");
+      // We return true to stop the recursion.
+      return true;
+    }
+    // Else this is an intermediate block between the spills and reloads and
+    // there is no spill in it, then use the pre-computed LRU to avoid walking
+    // it again. This improves compilation time.
+    LRU.addUnits(LRUs[MBB].getBitVector());
+    // We return false to continue the recursion.
+    return false;
+  };
+
+  /// \Returns the LiveRegUnits at `Reload` by stepping backwards in its BB.
+  auto GetReloadLRU = [this](MachineInstr *Reload) {
+    LiveRegUnits ReloadLRU(*TRI);
+    MachineBasicBlock *MBB = Reload->getParent();
+    ReloadLRU.addLiveOuts(*MBB);
+    // Start at the bottom of the BB and walk up until we find `Reload`.
+    for (MachineInstr &MI : llvm::reverse(*MBB)) {
+      if (&MI == Reload)
+        break;
+      // TODO: Check if this should be accumulate() instead of stepBackward().
+      ReloadLRU.stepBackward(MI);
+    }
+    return ReloadLRU;
+  };
+
+  // Start from each Reload and walk up the CFG with a depth-first traversal,
+  // looking for spills. Upon finding a spill we don't go beyond that point. In
+  // the meantime we accumulate the registers used. This is then used to find
+  // free physical registers.
+  DenseSet<MachineBasicBlock *> Visited;
+  for (const auto &ReloadData : Entry.Reloads) {
+    MachineInstr *Reload = ReloadData.MI;
+    // Add the Reload's LRU to the total LRU for the whole Spill-Reload range.
+    LiveRegUnits ReloadLRU = GetReloadLRU(Reload);
+    bool FoundSpill = AccumulateLRUUntilSpillFn(Reload, ReloadLRU);
+    LRU.addUnits(ReloadLRU.getBitVector());
+
+    // Traverse the CFG bottom-up accumulating LRUs until we reach the Spills.
+    if (!FoundSpill) {
+      for (MachineBasicBlock *PredMBB : Reload->getParent()->predecessors())
+        DFS(PredMBB, Visited, AccumulateLRUFn);
+    }
+  }
 }
 
 void Spill2Reg::generateCode() {
@@ -293,7 +415,10 @@ void Spill2Reg::generateCode() {
   }
 }
 
-void Spill2Reg::cleanup() { StackSlotData.clear(); }
+void Spill2Reg::cleanup() {
+  StackSlotData.clear();
+  LRUs.clear();
+}
 
 bool Spill2Reg::run() {
   // Walk over each instruction in the code keeping track of the processor's
diff --git a/llvm/test/CodeGen/X86/spill2reg_end_to_end_16bit.ll b/llvm/test/CodeGen/X86/spill2reg_end_to_end_16bit.ll
new file mode 100644
index 00000000000000..9d454d015fd9e2
--- /dev/null
+++ b/llvm/test/CodeGen/X86/spill2reg_end_to_end_16bit.ll
@@ -0,0 +1,177 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc %s -o - -mtriple=x86_64-unknown-linux -enable-spill2reg -mattr=+sse4.1 | FileCheck %s
+
+; End-to-end check that Spill2Reg works with 16-bit registers.
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@D0 = dso_local local_unnamed_addr global i16 0, align 4
+@D1 = dso_local local_unnamed_addr global i16 0, align 4
+@D2 = dso_local local_unnamed_addr global i16 0, align 4
+@D3 = dso_local local_unnamed_addr global i16 0, align 4
+@D4 = dso_local local_unnamed_addr global i16 0, align 4
+@D5 = dso_local local_unnamed_addr global i16 0, align 4
+@D6 = dso_local local_unnamed_addr global i16 0, align 4
+@D7 = dso_local local_unnamed_addr global i16 0, align 4
+@D8 = dso_local local_unnamed_addr global i16 0, align 4
+@D9 = dso_local local_unnamed_addr global i16 0, align 4
+@D10 = dso_local local_unnamed_addr global i16 0, align 4
+@D11 = dso_local local_unnamed_addr global i16 0, align 4
+@D12 = dso_local local_unnamed_addr global i16 0, align 4
+@D13 = dso_local local_unnamed_addr global i16 0, align 4
+@D14 = dso_local local_unnamed_addr global i16 0, align 4
+@D15 = dso_local local_unnamed_addr global i16 0, align 4
+@D16 = dso_local local_unnamed_addr global i16 0, align 4
+@D17 = dso_local local_unnamed_addr global i16 0, align 4
+@D18 = dso_local local_unnamed_addr global i16 0, align 4
+@U0 = dso_local local_unnamed_addr global i16 0, align 4
+@U1 = dso_local local_unnamed_addr global i16 0, align 4
+@U2 = dso_local local_unnamed_addr global i16 0, align 4
+@U3 = dso_local local_unnamed_addr global i16 0, align 4
+@U4 = dso_local local_unnamed_addr global i16 0, align 4
+@U5 = dso_local local_unnamed_addr global i16 0, align 4
+@U6 = dso_local local_unnamed_addr global i16 0, align 4
+@U7 = dso_local local_unnamed_addr global i16 0, align 4
+@U8 = dso_local local_unnamed_addr global i16 0, align 4
+@U9 = dso_local local_unnamed_addr global i16 0, align 4
+@U10 = dso_local local_unnamed_addr global i16 0, align 4
+@U11 = dso_local local_unnamed_addr global i16 0, align 4
+@U12 = dso_local local_unnamed_addr global i16 0, align 4
+@U13 = dso_local local_unnamed_addr global i16 0, align 4
+@U14 = dso_local local_unnamed_addr global i16 0, align 4
+@U15 = dso_local local_unnamed_addr global i16 0, align 4
+@U16 = dso_local local_unnamed_addr global i16 0, align 4
+@U17 = dso_local local_unnamed_addr global i16 0, align 4
+@U18 = dso_local local_unnamed_addr global i16 0, align 4
+
+; Function Attrs: mustprogress noinline nounwind uwtable
+define dso_local void @_Z5spillv() local_unnamed_addr #0 {
+; CHECK-LABEL: _Z5spillv:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    pushq %r15
+; CHECK-NEXT:    .cfi_def_cfa_offset 24
+; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    pushq %r13
+; CHECK-NEXT:    .cfi_def_cfa_offset 40
+; CHECK-NEXT:    pushq %r12
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    .cfi_def_cfa_offset 56
+; CHECK-NEXT:    .cfi_offset %rbx, -56
+; CHECK-NEXT:    .cfi_offset %r12, -48
+; CHECK-NEXT:    .cfi_offset %r13, -40
+; CHECK-NEXT:    .cfi_offset %r14, -32
+; CHECK-NEXT:    .cfi_offset %r15, -24
+; CHECK-NEXT:    .cfi_offset %rbp, -16
+; CHECK-NEXT:    movzwl D0(%rip), %eax
+; CHECK-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; CHECK-NEXT:    movzwl D1(%rip), %ecx
+; CHECK-NEXT:    movzwl D2(%rip), %edx
+; CHECK-NEXT:    movzwl D3(%rip), %esi
+; CHECK-NEXT:    movzwl D4(%rip), %edi
+; CHECK-NEXT:    movzwl D5(%rip), %r8d
+; CHECK-NEXT:    movzwl D6(%rip), %r9d
+; CHECK-NEXT:    movzwl D7(%rip), %r10d
+; CHECK-NEXT:    movzwl D8(%rip), %r11d
+; CHECK-NEXT:    movzwl D9(%rip), %ebx
+; CHECK-NEXT:    movzwl D10(%rip), %ebp
+; CHECK-NEXT:    movzwl D11(%rip), %r14d
+; CHECK-NEXT:    movzwl D12(%rip), %r15d
+; CHECK-NEXT:    movzwl D13(%rip), %r12d
+; CHECK-NEXT:    movzwl D14(%rip), %r13d
+; CHECK-NEXT:    movzwl D15(%rip), %eax
+; CHECK-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; CHECK-NEXT:    movzwl D16(%rip), %eax
+; CHECK-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; CHECK-NEXT:    movzwl D17(%rip), %eax
+; CHECK-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; CHECK-NEXT:    movzwl D18(%rip), %eax
+; CHECK-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
+; CHECK-NEXT:    movw %ax, U0(%rip)
+; CHECK-NEXT:    movw %cx, U1(%rip)
+; CHECK-NEXT:    movw %dx, U2(%rip)
+; CHECK-NEXT:    movw %si, U3(%rip)
+; CHECK-NEXT:    movw %di, U4(%rip)
+; CHECK-NEXT:    movw %r8w, U5(%rip)
+; CHECK-NEXT:    movw %r9w, U6(%rip)
+; CHECK-NEXT:    movw %r10w, U7(%rip)
+; CHECK-NEXT:    movw %r11w, U8(%rip)
+; CHECK-NEXT:    movw %bx, U9(%rip)
+; CHECK-NEXT:    movw %bp, U10(%rip)
+; CHECK-NEXT:    movw %r14w, U11(%rip)
+; CHECK-NEXT:    movw %r15w, U12(%rip)
+; CHECK-NEXT:    movw %r12w, U13(%rip)
+; CHECK-NEXT:    movw %r13w, U14(%rip)
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
+; CHECK-NEXT:    movw %ax, U15(%rip)
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
+; CHECK-NEXT:    movw %ax, U16(%rip)
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
+; CHECK-NEXT:    movw %ax, U17(%rip)
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
+; CHECK-NEXT:    movw %ax, U18(%rip)
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    popq %r12
+; CHECK-NEXT:    .cfi_def_cfa_offset 40
+; CHECK-NEXT:    popq %r13
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    .cfi_def_cfa_offset 24
+; CHECK-NEXT:    popq %r15
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    popq %rbp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+entry:
+  %0 = load i16, i16* @D0
+  %1 = load i16, i16* @D1
+  %2 = load i16, i16* @D2
+  %3 = load i16, i16* @D3
+  %4 = load i16, i16* @D4
+  %5 = load i16, i16* @D5
+  %6 = load i16, i16* @D6
+  %7 = load i16, i16* @D7
+  %8 = load i16, i16* @D8
+  %9 = load i16, i16* @D9
+  %10 = load i16, i16* @D10
+  %11 = load i16, i16* @D11
+  %12 = load i16, i16* @D12
+  %13 = load i16, i16* @D13
+  %14 = load i16, i16* @D14
+  %15 = load i16, i16* @D15
+  %16 = load i16, i16* @D16
+  %17 = load i16, i16* @D17
+  %18 = load i16, i16* @D18
+  call void asm sideeffect "", "~{memory}"() #1
+  store i16 %0, i16* @U0
+  store i16 %1, i16* @U1
+  store i16 %2, i16* @U2
+  store i16 %3, i16* @U3
+  store i16 %4, i16* @U4
+  store i16 %5, i16* @U5
+  store i16 %6, i16* @U6
+  store i16 %7, i16* @U7
+  store i16 %8, i16* @U8
+  store i16 %9, i16* @U9
+  store i16 %10, i16* @U10
+  store i16 %11, i16* @U11
+  store i16 %12, i16* @U12
+  store i16 %13, i16* @U13
+  store i16 %14, i16* @U14
+  store i16 %15, i16* @U15
+  store i16 %16, i16* @U16
+  store i16 %17, i16* @U17
+  store i16 %18, i16* @U18
+  ret void
+}
+
+attributes #0 = { mustprogress noinline nounwind uwtable "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #1 = { nounwind }
diff --git a/llvm/test/CodeGen/X86/spill2reg_end_to_end_32bit.ll b/llvm/test/CodeGen/X86/spill2reg_end_to_end_32bit.ll
new file mode 100644
index 00000000000000..3f1811cb971f53
--- /dev/null
+++ b/llvm/test/CodeGen/X86/spill2reg_end_to_end_32bit.ll
@@ -0,0 +1,178 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc %s -o - -mtriple=x86_64-unknown-linux -enable-spill2reg -mattr=+sse4.1 | FileCheck %s
+
+; End-to-end check that Spill2Reg works with 32-bit registers.
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@D0 = dso_local local_unnamed_addr global i32 0, align 4
+@D1 = dso_local local_unnamed_addr global i32 0, align 4
+@D2 = dso_local local_unnamed_addr global i32 0, align 4
+@D3 = dso_local local_unnamed_addr global i32 0, align 4
+@D4 = dso_local local_unnamed_addr global i32 0, align 4
+@D5 = dso_local local_unnamed_addr global i32 0, align 4
+@D6 = dso_local local_unnamed_addr global i32 0, align 4
+@D7 = dso_local local_unnamed_addr global i32 0, align 4
+@D8 = dso_local local_unnamed_addr global i32 0, align 4
+@D9 = dso_local local_unnamed_addr global i32 0, align 4
+@D10 = dso_local local_unnamed_addr global i32 0, align 4
+@D11 = dso_local local_unnamed_addr global i32 0, align 4
+@D12 = dso_local local_unnamed_addr global i32 0, align 4
+@D13 = dso_local local_unnamed_addr global i32 0, align 4
+@D14 = dso_local local_unnamed_addr global i32 0, align 4
+@D15 = dso_local local_unnamed_addr global i32 0, align 4
+@D16 = dso_local local_unnamed_addr global i32 0, align 4
+@D17 = dso_local local_unnamed_addr global i32 0, align 4
+@D18 = dso_local local_unnamed_addr global i32 0, align 4
+@U0 = dso_local local_unnamed_addr global i32 0, align 4
+@U1 = dso_local local_unnamed_addr global i32 0, align 4
+@U2 = dso_local local_unnamed_addr global i32 0, align 4
+@U3 = dso_local local_unnamed_addr global i32 0, align 4
+@U4 = dso_local local_unnamed_addr global i32 0, align 4
+@U5 = dso_local local_unnamed_addr global i32 0, align 4
+@U6 = dso_local local_unnamed_addr global i32 0, align 4
+@U7 = dso_local local_unnamed_addr global i32 0, align 4
+@U8 = dso_local local_unnamed_addr global i32 0, align 4
+@U9 = dso_local local_unnamed_addr global i32 0, align 4
+@U10 = dso_local local_unnamed_addr global i32 0, align 4
+@U11 = dso_local local_unnamed_addr global i32 0, align 4
+@U12 = dso_local local_unnamed_addr global i32 0, align 4
+@U13 = dso_local local_unnamed_addr global i32 0, align 4
+@U14 = dso_local local_unnamed_addr global i32 0, align 4
+@U15 = dso_local local_unnamed_addr global i32 0, align 4
+@U16 = dso_local local_unnamed_addr global i32 0, align 4
+@U17 = dso_local local_unnamed_addr global i32 0, align 4
+@U18 = dso_local local_unnamed_addr global i32 0, align 4
+
+; Function Attrs: mustprogress noinline nounwind uwtable
+define dso_local void @_Z5spillv() local_unnamed_addr #0 {
+; CHECK-LABEL: _Z5spillv:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    pushq %r15
+; CHECK-NEXT:    .cfi_def_cfa_offset 24
+; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    pushq %r13
+; CHECK-NEXT:    .cfi_def_cfa_offset 40
+; CHECK-NEXT:    pushq %r12
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    .cfi_def_cfa_offset 56
+; CHECK-NEXT:    .cfi_offset %rbx, -56
+; CHECK-NEXT:    .cfi_offset %r12, -48
+; CHECK-NEXT:    .cfi_offset %r13, -40
+; CHECK-NEXT:    .cfi_offset %r14, -32
+; CHECK-NEXT:    .cfi_offset %r15, -24
+; CHECK-NEXT:    .cfi_offset %rbp, -16
+; CHECK-NEXT:    movl D0(%rip), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movl D1(%rip), %ecx
+; CHECK-NEXT:    movl D2(%rip), %edx
+; CHECK-NEXT:    movl D3(%rip), %esi
+; CHECK-NEXT:    movl D4(%rip), %edi
+; CHECK-NEXT:    movl D5(%rip), %r8d
+; CHECK-NEXT:    movl D6(%rip), %r9d
+; CHECK-NEXT:    movl D7(%rip), %r10d
+; CHECK-NEXT:    movl D8(%rip), %r11d
+; CHECK-NEXT:    movl D9(%rip), %ebx
+; CHECK-NEXT:    movl D10(%rip), %ebp
+; CHECK-NEXT:    movl D11(%rip), %r14d
+; CHECK-NEXT:    movl D12(%rip), %r15d
+; CHECK-NEXT:    movl D13(%rip), %r12d
+; CHECK-NEXT:    movl D14(%rip), %r13d
+; CHECK-NEXT:    movl D15(%rip), %eax
+; CHECK-NEXT:    movd %eax, %xmm0
+; CHECK-NEXT:    movl D16(%rip), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movl D17(%rip), %eax
+; CHECK-NEXT:    movd %eax, %xmm1
+; CHECK-NEXT:    movl D18(%rip), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; CHECK-NEXT:    movl %eax, U0(%rip)
+; CHECK-NEXT:    movl %ecx, U1(%rip)
+; CHECK-NEXT:    movl %edx, U2(%rip)
+; CHECK-NEXT:    movl %esi, U3(%rip)
+; CHECK-NEXT:    movl %edi, U4(%rip)
+; CHECK-NEXT:    movl %r8d, U5(%rip)
+; CHECK-NEXT:    movl %r9d, U6(%rip)
+; CHECK-NEXT:    movl %r10d, U7(%rip)
+; CHECK-NEXT:    movl %r11d, U8(%rip)
+; CHECK-NEXT:    movl %ebx, U9(%rip)
+; CHECK-NEXT:    movl %ebp, U10(%rip)
+; CHECK-NEXT:    movl %r14d, U11(%rip)
+; CHECK-NEXT:    movl %r15d, U12(%rip)
+; CHECK-NEXT:    movl %r12d, U13(%rip)
+; CHECK-NEXT:    movl %r13d, U14(%rip)
+; CHECK-NEXT:    movd %xmm0, %eax
+; CHECK-NEXT:    movl %eax, U15(%rip)
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; CHECK-NEXT:    movl %eax, U16(%rip)
+; CHECK-NEXT:    movd %xmm1, %eax
+; CHECK-NEXT:    movl %eax, U17(%rip)
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; CHECK-NEXT:    movl %eax, U18(%rip)
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    popq %r12
+; CHECK-NEXT:    .cfi_def_cfa_offset 40
+; CHECK-NEXT:    popq %r13
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    .cfi_def_cfa_offset 24
+; CHECK-NEXT:    popq %r15
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    popq %rbp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+entry:
+  %0 = load i32, i32* @D0
+  %1 = load i32, i32* @D1
+  %2 = load i32, i32* @D2
+  %3 = load i32, i32* @D3
+  %4 = load i32, i32* @D4
+  %5 = load i32, i32* @D5
+  %6 = load i32, i32* @D6
+  %7 = load i32, i32* @D7
+  %8 = load i32, i32* @D8
+  %9 = load i32, i32* @D9
+  %10 = load i32, i32* @D10
+  %11 = load i32, i32* @D11
+  %12 = load i32, i32* @D12
+  %13 = load i32, i32* @D13
+  %14 = load i32, i32* @D14
+  %15 = load i32, i32* @D15
+  %16 = load i32, i32* @D16
+  %17 = load i32, i32* @D17
+  %18 = load i32, i32* @D18
+  call void asm sideeffect "", "~{memory}"() #1
+  store i32 %0, i32* @U0
+  store i32 %1, i32* @U1
+  store i32 %2, i32* @U2
+  store i32 %3, i32* @U3
+  store i32 %4, i32* @U4
+  store i32 %5, i32* @U5
+  store i32 %6, i32* @U6
+  store i32 %7, i32* @U7
+  store i32 %8, i32* @U8
+  store i32 %9, i32* @U9
+  store i32 %10, i32* @U10
+  store i32 %11, i32* @U11
+  store i32 %12, i32* @U12
+  store i32 %13, i32* @U13
+  store i32 %14, i32* @U14
+  store i32 %15, i32* @U15
+  store i32 %16, i32* @U16
+  store i32 %17, i32* @U17
+  store i32 %18, i32* @U18
+  ret void
+}
+
+attributes #0 = { mustprogress noinline nounwind uwtable "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #1 = { nounwind }
+
diff --git a/llvm/test/CodeGen/X86/spill2reg_end_to_end_64bit.ll b/llvm/test/CodeGen/X86/spill2reg_end_to_end_64bit.ll
new file mode 100644
index 00000000000000..652d077a66b280
--- /dev/null
+++ b/llvm/test/CodeGen/X86/spill2reg_end_to_end_64bit.ll
@@ -0,0 +1,177 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc %s -o - -mtriple=x86_64-unknown-linux -enable-spill2reg -mattr=+sse4.1 | FileCheck %s
+
+; End-to-end check that Spill2Reg works with 64-bit registers.
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@D0 = dso_local local_unnamed_addr global i64 0, align 4
+@D1 = dso_local local_unnamed_addr global i64 0, align 4
+@D2 = dso_local local_unnamed_addr global i64 0, align 4
+@D3 = dso_local local_unnamed_addr global i64 0, align 4
+@D4 = dso_local local_unnamed_addr global i64 0, align 4
+@D5 = dso_local local_unnamed_addr global i64 0, align 4
+@D6 = dso_local local_unnamed_addr global i64 0, align 4
+@D7 = dso_local local_unnamed_addr global i64 0, align 4
+@D8 = dso_local local_unnamed_addr global i64 0, align 4
+@D9 = dso_local local_unnamed_addr global i64 0, align 4
+@D10 = dso_local local_unnamed_addr global i64 0, align 4
+@D11 = dso_local local_unnamed_addr global i64 0, align 4
+@D12 = dso_local local_unnamed_addr global i64 0, align 4
+@D13 = dso_local local_unnamed_addr global i64 0, align 4
+@D14 = dso_local local_unnamed_addr global i64 0, align 4
+@D15 = dso_local local_unnamed_addr global i64 0, align 4
+@D16 = dso_local local_unnamed_addr global i64 0, align 4
+@D17 = dso_local local_unnamed_addr global i64 0, align 4
+@D18 = dso_local local_unnamed_addr global i64 0, align 4
+@U0 = dso_local local_unnamed_addr global i64 0, align 4
+@U1 = dso_local local_unnamed_addr global i64 0, align 4
+@U2 = dso_local local_unnamed_addr global i64 0, align 4
+@U3 = dso_local local_unnamed_addr global i64 0, align 4
+@U4 = dso_local local_unnamed_addr global i64 0, align 4
+@U5 = dso_local local_unnamed_addr global i64 0, align 4
+@U6 = dso_local local_unnamed_addr global i64 0, align 4
+@U7 = dso_local local_unnamed_addr global i64 0, align 4
+@U8 = dso_local local_unnamed_addr global i64 0, align 4
+@U9 = dso_local local_unnamed_addr global i64 0, align 4
+@U10 = dso_local local_unnamed_addr global i64 0, align 4
+@U11 = dso_local local_unnamed_addr global i64 0, align 4
+@U12 = dso_local local_unnamed_addr global i64 0, align 4
+@U13 = dso_local local_unnamed_addr global i64 0, align 4
+@U14 = dso_local local_unnamed_addr global i64 0, align 4
+@U15 = dso_local local_unnamed_addr global i64 0, align 4
+@U16 = dso_local local_unnamed_addr global i64 0, align 4
+@U17 = dso_local local_unnamed_addr global i64 0, align 4
+@U18 = dso_local local_unnamed_addr global i64 0, align 4
+
+; Function Attrs: mustprogress noinline nounwind uwtable
+define dso_local void @_Z5spillv() local_unnamed_addr #0 {
+; CHECK-LABEL: _Z5spillv:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    pushq %r15
+; CHECK-NEXT:    .cfi_def_cfa_offset 24
+; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    pushq %r13
+; CHECK-NEXT:    .cfi_def_cfa_offset 40
+; CHECK-NEXT:    pushq %r12
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    .cfi_def_cfa_offset 56
+; CHECK-NEXT:    .cfi_offset %rbx, -56
+; CHECK-NEXT:    .cfi_offset %r12, -48
+; CHECK-NEXT:    .cfi_offset %r13, -40
+; CHECK-NEXT:    .cfi_offset %r14, -32
+; CHECK-NEXT:    .cfi_offset %r15, -24
+; CHECK-NEXT:    .cfi_offset %rbp, -16
+; CHECK-NEXT:    movq D0(%rip), %rax
+; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq D1(%rip), %rcx
+; CHECK-NEXT:    movq D2(%rip), %rdx
+; CHECK-NEXT:    movq D3(%rip), %rsi
+; CHECK-NEXT:    movq D4(%rip), %rdi
+; CHECK-NEXT:    movq D5(%rip), %r8
+; CHECK-NEXT:    movq D6(%rip), %r9
+; CHECK-NEXT:    movq D7(%rip), %r10
+; CHECK-NEXT:    movq D8(%rip), %r11
+; CHECK-NEXT:    movq D9(%rip), %rbx
+; CHECK-NEXT:    movq D10(%rip), %r14
+; CHECK-NEXT:    movq D11(%rip), %r15
+; CHECK-NEXT:    movq D12(%rip), %r12
+; CHECK-NEXT:    movq D13(%rip), %r13
+; CHECK-NEXT:    movq D14(%rip), %rbp
+; CHECK-NEXT:    movq D15(%rip), %rax
+; CHECK-NEXT:    movq %rax, %xmm0
+; CHECK-NEXT:    movq D16(%rip), %rax
+; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq D17(%rip), %rax
+; CHECK-NEXT:    movq %rax, %xmm1
+; CHECK-NEXT:    movq D18(%rip), %rax
+; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; CHECK-NEXT:    movq %rax, U0(%rip)
+; CHECK-NEXT:    movq %rcx, U1(%rip)
+; CHECK-NEXT:    movq %rdx, U2(%rip)
+; CHECK-NEXT:    movq %rsi, U3(%rip)
+; CHECK-NEXT:    movq %rdi, U4(%rip)
+; CHECK-NEXT:    movq %r8, U5(%rip)
+; CHECK-NEXT:    movq %r9, U6(%rip)
+; CHECK-NEXT:    movq %r10, U7(%rip)
+; CHECK-NEXT:    movq %r11, U8(%rip)
+; CHECK-NEXT:    movq %rbx, U9(%rip)
+; CHECK-NEXT:    movq %r14, U10(%rip)
+; CHECK-NEXT:    movq %r15, U11(%rip)
+; CHECK-NEXT:    movq %r12, U12(%rip)
+; CHECK-NEXT:    movq %r13, U13(%rip)
+; CHECK-NEXT:    movq %rbp, U14(%rip)
+; CHECK-NEXT:    movq %xmm0, %rax
+; CHECK-NEXT:    movq %rax, U15(%rip)
+; CHECK-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; CHECK-NEXT:    movq %rax, U16(%rip)
+; CHECK-NEXT:    movq %xmm1, %rax
+; CHECK-NEXT:    movq %rax, U17(%rip)
+; CHECK-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; CHECK-NEXT:    movq %rax, U18(%rip)
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    popq %r12
+; CHECK-NEXT:    .cfi_def_cfa_offset 40
+; CHECK-NEXT:    popq %r13
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    .cfi_def_cfa_offset 24
+; CHECK-NEXT:    popq %r15
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    popq %rbp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+entry:
+  %0 = load i64, i64* @D0
+  %1 = load i64, i64* @D1
+  %2 = load i64, i64* @D2
+  %3 = load i64, i64* @D3
+  %4 = load i64, i64* @D4
+  %5 = load i64, i64* @D5
+  %6 = load i64, i64* @D6
+  %7 = load i64, i64* @D7
+  %8 = load i64, i64* @D8
+  %9 = load i64, i64* @D9
+  %10 = load i64, i64* @D10
+  %11 = load i64, i64* @D11
+  %12 = load i64, i64* @D12
+  %13 = load i64, i64* @D13
+  %14 = load i64, i64* @D14
+  %15 = load i64, i64* @D15
+  %16 = load i64, i64* @D16
+  %17 = load i64, i64* @D17
+  %18 = load i64, i64* @D18
+  call void asm sideeffect "", "~{memory}"() #1
+  store i64 %0, i64* @U0
+  store i64 %1, i64* @U1
+  store i64 %2, i64* @U2
+  store i64 %3, i64* @U3
+  store i64 %4, i64* @U4
+  store i64 %5, i64* @U5
+  store i64 %6, i64* @U6
+  store i64 %7, i64* @U7
+  store i64 %8, i64* @U8
+  store i64 %9, i64* @U9
+  store i64 %10, i64* @U10
+  store i64 %11, i64* @U11
+  store i64 %12, i64* @U12
+  store i64 %13, i64* @U13
+  store i64 %14, i64* @U14
+  store i64 %15, i64* @U15
+  store i64 %16, i64* @U16
+  store i64 %17, i64* @U17
+  store i64 %18, i64* @U18
+  ret void
+}
+
+attributes #0 = { mustprogress noinline nounwind uwtable "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #1 = { nounwind }
diff --git a/llvm/test/CodeGen/X86/spill2reg_end_to_end_8bit.ll b/llvm/test/CodeGen/X86/spill2reg_end_to_end_8bit.ll
new file mode 100644
index 00000000000000..cd22eb43c07e72
--- /dev/null
+++ b/llvm/test/CodeGen/X86/spill2reg_end_to_end_8bit.ll
@@ -0,0 +1,177 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc %s -o - -mtriple=x86_64-unknown-linux -enable-spill2reg -mattr=+sse4.1 | FileCheck %s
+
+; End-to-end check that Spill2Reg works with 8-bit registers.
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@D0 = dso_local local_unnamed_addr global i8 0, align 4
+@D1 = dso_local local_unnamed_addr global i8 0, align 4
+@D2 = dso_local local_unnamed_addr global i8 0, align 4
+@D3 = dso_local local_unnamed_addr global i8 0, align 4
+@D4 = dso_local local_unnamed_addr global i8 0, align 4
+@D5 = dso_local local_unnamed_addr global i8 0, align 4
+@D6 = dso_local local_unnamed_addr global i8 0, align 4
+@D7 = dso_local local_unnamed_addr global i8 0, align 4
+@D8 = dso_local local_unnamed_addr global i8 0, align 4
+@D9 = dso_local local_unnamed_addr global i8 0, align 4
+@D10 = dso_local local_unnamed_addr global i8 0, align 4
+@D11 = dso_local local_unnamed_addr global i8 0, align 4
+@D12 = dso_local local_unnamed_addr global i8 0, align 4
+@D13 = dso_local local_unnamed_addr global i8 0, align 4
+@D14 = dso_local local_unnamed_addr global i8 0, align 4
+@D15 = dso_local local_unnamed_addr global i8 0, align 4
+@D16 = dso_local local_unnamed_addr global i8 0, align 4
+@D17 = dso_local local_unnamed_addr global i8 0, align 4
+@D18 = dso_local local_unnamed_addr global i8 0, align 4
+@U0 = dso_local local_unnamed_addr global i8 0, align 4
+@U1 = dso_local local_unnamed_addr global i8 0, align 4
+@U2 = dso_local local_unnamed_addr global i8 0, align 4
+@U3 = dso_local local_unnamed_addr global i8 0, align 4
+@U4 = dso_local local_unnamed_addr global i8 0, align 4
+@U5 = dso_local local_unnamed_addr global i8 0, align 4
+@U6 = dso_local local_unnamed_addr global i8 0, align 4
+@U7 = dso_local local_unnamed_addr global i8 0, align 4
+@U8 = dso_local local_unnamed_addr global i8 0, align 4
+@U9 = dso_local local_unnamed_addr global i8 0, align 4
+@U10 = dso_local local_unnamed_addr global i8 0, align 4
+@U11 = dso_local local_unnamed_addr global i8 0, align 4
+@U12 = dso_local local_unnamed_addr global i8 0, align 4
+@U13 = dso_local local_unnamed_addr global i8 0, align 4
+@U14 = dso_local local_unnamed_addr global i8 0, align 4
+@U15 = dso_local local_unnamed_addr global i8 0, align 4
+@U16 = dso_local local_unnamed_addr global i8 0, align 4
+@U17 = dso_local local_unnamed_addr global i8 0, align 4
+@U18 = dso_local local_unnamed_addr global i8 0, align 4
+
+; Function Attrs: mustprogress noinline nounwind uwtable
+define dso_local void @_Z5spillv() local_unnamed_addr #0 {
+; CHECK-LABEL: _Z5spillv:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    pushq %r15
+; CHECK-NEXT:    .cfi_def_cfa_offset 24
+; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    pushq %r13
+; CHECK-NEXT:    .cfi_def_cfa_offset 40
+; CHECK-NEXT:    pushq %r12
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    .cfi_def_cfa_offset 56
+; CHECK-NEXT:    .cfi_offset %rbx, -56
+; CHECK-NEXT:    .cfi_offset %r12, -48
+; CHECK-NEXT:    .cfi_offset %r13, -40
+; CHECK-NEXT:    .cfi_offset %r14, -32
+; CHECK-NEXT:    .cfi_offset %r15, -24
+; CHECK-NEXT:    .cfi_offset %rbp, -16
+; CHECK-NEXT:    movzbl D0(%rip), %eax
+; CHECK-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movzbl D1(%rip), %ecx
+; CHECK-NEXT:    movzbl D2(%rip), %edx
+; CHECK-NEXT:    movzbl D3(%rip), %esi
+; CHECK-NEXT:    movzbl D4(%rip), %edi
+; CHECK-NEXT:    movzbl D5(%rip), %r8d
+; CHECK-NEXT:    movzbl D6(%rip), %r9d
+; CHECK-NEXT:    movzbl D7(%rip), %r10d
+; CHECK-NEXT:    movzbl D8(%rip), %r11d
+; CHECK-NEXT:    movzbl D9(%rip), %ebx
+; CHECK-NEXT:    movzbl D10(%rip), %ebp
+; CHECK-NEXT:    movzbl D11(%rip), %r14d
+; CHECK-NEXT:    movzbl D12(%rip), %r15d
+; CHECK-NEXT:    movzbl D13(%rip), %r12d
+; CHECK-NEXT:    movzbl D14(%rip), %r13d
+; CHECK-NEXT:    movzbl D15(%rip), %eax
+; CHECK-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movzbl D16(%rip), %eax
+; CHECK-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movzbl D17(%rip), %eax
+; CHECK-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movzbl D18(%rip), %eax
+; CHECK-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; CHECK-NEXT:    movb %al, U0(%rip)
+; CHECK-NEXT:    movb %cl, U1(%rip)
+; CHECK-NEXT:    movb %dl, U2(%rip)
+; CHECK-NEXT:    movb %sil, U3(%rip)
+; CHECK-NEXT:    movb %dil, U4(%rip)
+; CHECK-NEXT:    movb %r8b, U5(%rip)
+; CHECK-NEXT:    movb %r9b, U6(%rip)
+; CHECK-NEXT:    movb %r10b, U7(%rip)
+; CHECK-NEXT:    movb %r11b, U8(%rip)
+; CHECK-NEXT:    movb %bl, U9(%rip)
+; CHECK-NEXT:    movb %bpl, U10(%rip)
+; CHECK-NEXT:    movb %r14b, U11(%rip)
+; CHECK-NEXT:    movb %r15b, U12(%rip)
+; CHECK-NEXT:    movb %r12b, U13(%rip)
+; CHECK-NEXT:    movb %r13b, U14(%rip)
+; CHECK-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; CHECK-NEXT:    movb %al, U15(%rip)
+; CHECK-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; CHECK-NEXT:    movb %al, U16(%rip)
+; CHECK-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; CHECK-NEXT:    movb %al, U17(%rip)
+; CHECK-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; CHECK-NEXT:    movb %al, U18(%rip)
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    popq %r12
+; CHECK-NEXT:    .cfi_def_cfa_offset 40
+; CHECK-NEXT:    popq %r13
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    .cfi_def_cfa_offset 24
+; CHECK-NEXT:    popq %r15
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    popq %rbp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+entry:
+  %0 = load i8, i8* @D0
+  %1 = load i8, i8* @D1
+  %2 = load i8, i8* @D2
+  %3 = load i8, i8* @D3
+  %4 = load i8, i8* @D4
+  %5 = load i8, i8* @D5
+  %6 = load i8, i8* @D6
+  %7 = load i8, i8* @D7
+  %8 = load i8, i8* @D8
+  %9 = load i8, i8* @D9
+  %10 = load i8, i8* @D10
+  %11 = load i8, i8* @D11
+  %12 = load i8, i8* @D12
+  %13 = load i8, i8* @D13
+  %14 = load i8, i8* @D14
+  %15 = load i8, i8* @D15
+  %16 = load i8, i8* @D16
+  %17 = load i8, i8* @D17
+  %18 = load i8, i8* @D18
+  call void asm sideeffect "", "~{memory}"() #1
+  store i8 %0, i8* @U0
+  store i8 %1, i8* @U1
+  store i8 %2, i8* @U2
+  store i8 %3, i8* @U3
+  store i8 %4, i8* @U4
+  store i8 %5, i8* @U5
+  store i8 %6, i8* @U6
+  store i8 %7, i8* @U7
+  store i8 %8, i8* @U8
+  store i8 %9, i8* @U9
+  store i8 %10, i8* @U10
+  store i8 %11, i8* @U11
+  store i8 %12, i8* @U12
+  store i8 %13, i8* @U13
+  store i8 %14, i8* @U14
+  store i8 %15, i8* @U15
+  store i8 %16, i8* @U16
+  store i8 %17, i8* @U17
+  store i8 %18, i8* @U18
+  ret void
+}
+
+attributes #0 = { mustprogress noinline nounwind uwtable "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #1 = { nounwind }
diff --git a/llvm/test/CodeGen/X86/spill2reg_liveregs_all_live.mir b/llvm/test/CodeGen/X86/spill2reg_liveregs_all_live.mir
new file mode 100644
index 00000000000000..093c8bf2b34308
--- /dev/null
+++ b/llvm/test/CodeGen/X86/spill2reg_liveregs_all_live.mir
@@ -0,0 +1,62 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc %s -o - -mtriple=x86_64-unknown-linux -enable-spill2reg -mattr=+sse4.1 --run-pass=spill2reg -simplify-mir -spill2reg-mem-instrs=0 -spill2reg-vec-instrs=99999 | FileCheck %s
+
+# Check that spill2reg is not applied when all xmm registers are live.
+
+--- |
+  @D0 = dso_local local_unnamed_addr global i32 0, align 4
+  @D1 = dso_local local_unnamed_addr global i32 0, align 4
+  @U0 = dso_local local_unnamed_addr global i32 0, align 4
+  @U1 = dso_local local_unnamed_addr global i32 0, align 4
+  define void @func() { ret void }
+...
+---
+name: func
+alignment:       16
+tracksRegLiveness: true
+tracksDebugUserValues: true
+frameInfo:
+  maxAlignment:    4
+stack:
+  - { id: 0, type: spill-slot, size: 4, alignment: 4 }
+  - { id: 1, type: spill-slot, size: 4, alignment: 4 }
+machineFunctionInfo: {}
+body:             |
+  ; CHECK-LABEL: name: func
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   liveins: $xmm0, $xmm1, $xmm2, $xmm3, $xmm4, $xmm5, $xmm6, $xmm7, $xmm8, $xmm9, $xmm10, $xmm11, $xmm12, $xmm13, $xmm14, $xmm15
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $eax = MOV32rm $rip, 1, $noreg, @D0, $noreg :: (dereferenceable load (s32) from @D0)
+  ; CHECK-NEXT:   MOV32mr %stack.0, 1, $noreg, 0, $noreg, killed renamable $eax :: (store (s32) into %stack.0)
+  ; CHECK-NEXT:   $eax = MOV32rm $rip, 1, $noreg, @D1, $noreg :: (dereferenceable load (s32) from @D1)
+  ; CHECK-NEXT:   MOV32mr %stack.1, 1, $noreg, 0, $noreg, killed renamable $eax :: (store (s32) into %stack.1)
+  ; CHECK-NEXT:   $eax = MOV32rm %stack.0, 1, $noreg, 0, $noreg :: (load (s32) from %stack.0)
+  ; CHECK-NEXT:   MOV32mr $rip, 1, $noreg, @U0, $noreg, killed renamable $eax :: (store (s32) into @U0)
+  ; CHECK-NEXT:   $eax = MOV32rm %stack.1, 1, $noreg, 0, $noreg :: (load (s32) from %stack.1)
+  ; CHECK-NEXT:   MOV32mr $rip, 1, $noreg, @U1, $noreg, killed renamable $eax :: (store (s32) into @U1)
+  ; CHECK-NEXT:   JMP_1 %bb.1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   liveins: $xmm0, $xmm1, $xmm2, $xmm3, $xmm4, $xmm5, $xmm6, $xmm7, $xmm8, $xmm9, $xmm10, $xmm11, $xmm12, $xmm13, $xmm14, $xmm15
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   RET 0
+  bb.0:
+    successors: %bb.1
+    liveins: $xmm0, $xmm1, $xmm2, $xmm3, $xmm4, $xmm5, $xmm6, $xmm7, $xmm8, $xmm9, $xmm10, $xmm11, $xmm12, $xmm13, $xmm14, $xmm15
+
+    $eax = MOV32rm $rip, 1, $noreg, @D0, $noreg :: (dereferenceable load (s32) from @D0)
+    MOV32mr %stack.0, 1, $noreg, 0, $noreg, killed renamable $eax :: (store (s32) into %stack.0)
+    $eax = MOV32rm $rip, 1, $noreg, @D1, $noreg :: (dereferenceable load (s32) from @D1)
+    MOV32mr %stack.1, 1, $noreg, 0, $noreg, killed renamable $eax :: (store (s32) into %stack.1)
+
+    $eax = MOV32rm %stack.0, 1, $noreg, 0, $noreg :: (load (s32) from %stack.0)
+    MOV32mr $rip, 1, $noreg, @U0, $noreg, killed renamable $eax :: (store (s32) into @U0)
+    $eax = MOV32rm %stack.1, 1, $noreg, 0, $noreg :: (load (s32) from %stack.1)
+    MOV32mr $rip, 1, $noreg, @U1, $noreg, killed renamable $eax :: (store (s32) into @U1)
+    JMP_1 %bb.1
+
+  bb.1:
+    liveins: $xmm0, $xmm1, $xmm2, $xmm3, $xmm4, $xmm5, $xmm6, $xmm7, $xmm8, $xmm9, $xmm10, $xmm11, $xmm12, $xmm13, $xmm14, $xmm15
+    RET 0
+
+...
diff --git a/llvm/test/CodeGen/X86/spill2reg_liveregs_call.mir b/llvm/test/CodeGen/X86/spill2reg_liveregs_call.mir
new file mode 100644
index 00000000000000..0279936a575e93
--- /dev/null
+++ b/llvm/test/CodeGen/X86/spill2reg_liveregs_call.mir
@@ -0,0 +1,46 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc %s -o - -mtriple=x86_64-unknown-linux -enable-spill2reg -mattr=+sse4.1 --run-pass=spill2reg -simplify-mir -spill2reg-mem-instrs=0 -spill2reg-vec-instrs=99999 | FileCheck %s
+
+# Check that reg liveness works correctly across calls.
+
+--- |
+  @D0 = dso_local local_unnamed_addr global i32 0, align 4
+  @D1 = dso_local local_unnamed_addr global i32 0, align 4
+  @U0 = dso_local local_unnamed_addr global i32 0, align 4
+  @U1 = dso_local local_unnamed_addr global i32 0, align 4
+  @Cond = dso_local local_unnamed_addr global i32 0, align 4
+  declare void @foo()
+  define void @func() { ret void }
+...
+---
+name: func
+alignment:       16
+tracksRegLiveness: true
+tracksDebugUserValues: true
+frameInfo:
+  maxAlignment:    4
+stack:
+  - { id: 0, type: spill-slot, size: 4, alignment: 4 }
+  - { id: 1, type: spill-slot, size: 4, alignment: 4 }
+machineFunctionInfo: {}
+body:             |
+ ; CHECK-LABEL:  bb.0:
+ ; CHECK-NEXT:    $eax = MOV32rm $rip, 1, $noreg, @D0, $noreg :: (dereferenceable load (s32) from @D0)
+ ; CHECK-NEXT:    MOV32mr %stack.0, 1, $noreg, 0, $noreg, killed renamable $eax :: (store (s32) into %stack.0)
+ ; CHECK-NEXT:    CALL64pcrel32 @foo, csr_64, implicit $rsp, implicit-def $rsp
+ ; CHECK-NEXT:    $eax = MOV32rm %stack.0, 1, $noreg, 0, $noreg :: (load (s32) from %stack.0)
+ ; CHECK-NEXT:    MOV32mr $rip, 1, $noreg, @U0, $noreg, killed renamable $eax :: (store (s32) into @U0)
+ ; CHECK-NEXT:    RET 0
+
+  bb.0:
+    $eax = MOV32rm $rip, 1, $noreg, @D0, $noreg :: (dereferenceable load (s32) from @D0)
+    MOV32mr %stack.0, 1, $noreg, 0, $noreg, killed renamable $eax :: (store (s32) into %stack.0)
+
+    ; The call may touch all xmm regs, so disable spill2reg across it
+    CALL64pcrel32 @foo, csr_64, implicit $rsp, implicit-def $rsp
+
+    $eax = MOV32rm %stack.0, 1, $noreg, 0, $noreg :: (load (s32) from %stack.0)
+    MOV32mr $rip, 1, $noreg, @U0, $noreg, killed renamable $eax :: (store (s32) into @U0)
+    RET 0
+
+...
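
For context on the comment in the test above: on x86-64 the csr_64 mask
preserves only general-purpose registers, so every xmm register is
caller-saved and a value parked in an xmm register does not survive a call.
A minimal sketch of how such clobbering can be detected with the stock
CodeGen API; this is illustrative only, not necessarily the pass's exact
check:

  #include "llvm/CodeGen/MachineInstr.h"
  #include "llvm/CodeGen/MachineOperand.h"
  using namespace llvm;

  // Calls carry a register-mask operand listing the registers preserved by
  // the callee; anything not in the mask (all xmm regs under csr_64) is
  // clobbered by the call.
  static bool callClobbersReg(const MachineInstr &MI, MCRegister Reg) {
    if (!MI.isCall())
      return false;
    for (const MachineOperand &MO : MI.operands())
      if (MO.isRegMask() &&
          MachineOperand::clobbersPhysReg(MO.getRegMask(), Reg))
        return true;
    return false;
  }
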
diff --git a/llvm/test/CodeGen/X86/spill2reg_liveregs_defined_in_bb.mir b/llvm/test/CodeGen/X86/spill2reg_liveregs_defined_in_bb.mir
new file mode 100644
index 00000000000000..7cb35df1974d8e
--- /dev/null
+++ b/llvm/test/CodeGen/X86/spill2reg_liveregs_defined_in_bb.mir
@@ -0,0 +1,57 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc %s -o - -mtriple=x86_64-unknown-linux -enable-spill2reg -mattr=+sse4.1 --run-pass=spill2reg -simplify-mir -spill2reg-mem-instrs=0 -spill2reg-vec-instrs=99999 | FileCheck %s
+
+# Check that spill2reg won't use $xmm0 if it is defined by some other
+# instruction in the BB and is live. Instead it should use $xmm1 and $xmm2.
+
+
+--- |
+  @D0 = dso_local local_unnamed_addr global i32 0, align 4
+  @D1 = dso_local local_unnamed_addr global i32 0, align 4
+  @U0 = dso_local local_unnamed_addr global i32 0, align 4
+  @U1 = dso_local local_unnamed_addr global i32 0, align 4
+  define void @func() { ret void }
+...
+---
+name: func
+alignment:       16
+tracksRegLiveness: true
+tracksDebugUserValues: true
+frameInfo:
+  maxAlignment:    4
+stack:
+  - { id: 0, type: spill-slot, size: 4, alignment: 4 }
+  - { id: 1, type: spill-slot, size: 4, alignment: 4 }
+machineFunctionInfo: {}
+body:             |
+  bb.0:
+    liveins: $eax
+    ; CHECK-LABEL: name: func
+    ; CHECK: liveins: $eax
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: $xmm0 = MOVDI2PDIrr $eax
+    ; CHECK-NEXT: $eax = MOV32rm $rip, 1, $noreg, @D0, $noreg :: (dereferenceable load (s32) from @D0)
+    ; CHECK-NEXT: $xmm1 = MOVDI2PDIrr $eax
+    ; CHECK-NEXT: $eax = MOV32rm $rip, 1, $noreg, @D1, $noreg :: (dereferenceable load (s32) from @D1)
+    ; CHECK-NEXT: $xmm2 = MOVDI2PDIrr $eax
+    ; CHECK-NEXT: $eax = MOVPDI2DIrr $xmm1
+    ; CHECK-NEXT: MOV32mr $rip, 1, $noreg, @U0, $noreg, killed renamable $eax :: (store (s32) into @U0)
+    ; CHECK-NEXT: $eax = MOVPDI2DIrr $xmm2
+    ; CHECK-NEXT: MOV32mr $rip, 1, $noreg, @U1, $noreg, killed renamable $eax :: (store (s32) into @U1)
+    ; CHECK-NEXT: $eax = MOVPDI2DIrr $xmm0
+    ; CHECK-NEXT: RET $eax
+    $xmm0 = MOVDI2PDIrr $eax
+
+    $eax = MOV32rm $rip, 1, $noreg, @D0, $noreg :: (dereferenceable load (s32) from @D0)
+    MOV32mr %stack.0, 1, $noreg, 0, $noreg, killed renamable $eax :: (store (s32) into %stack.0)
+    $eax = MOV32rm $rip, 1, $noreg, @D1, $noreg :: (dereferenceable load (s32) from @D1)
+    MOV32mr %stack.1, 1, $noreg, 0, $noreg, killed renamable $eax :: (store (s32) into %stack.1)
+
+    $eax = MOV32rm %stack.0, 1, $noreg, 0, $noreg :: (load (s32) from %stack.0)
+    MOV32mr $rip, 1, $noreg, @U0, $noreg, killed renamable $eax :: (store (s32) into @U0)
+    $eax = MOV32rm %stack.1, 1, $noreg, 0, $noreg :: (load (s32) from %stack.1)
+    MOV32mr $rip, 1, $noreg, @U1, $noreg, killed renamable $eax :: (store (s32) into @U1)
+    $eax = MOVPDI2DIrr $xmm0
+    RET $eax
+
+...
diff --git a/llvm/test/CodeGen/X86/spill2reg_liveregs_livein.mir b/llvm/test/CodeGen/X86/spill2reg_liveregs_livein.mir
new file mode 100644
index 00000000000000..88f903b27493a4
--- /dev/null
+++ b/llvm/test/CodeGen/X86/spill2reg_liveregs_livein.mir
@@ -0,0 +1,63 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc %s -o - -mtriple=x86_64-unknown-linux -enable-spill2reg -mattr=+sse4.1 --run-pass=spill2reg -simplify-mir -spill2reg-mem-instrs=0 -spill2reg-vec-instrs=99999 | FileCheck %s
+
+# Check that spill2reg doesn't use $xmm0 and $xmm2 if they are live-in.
+# Instead it should use $xmm1 and $xmm3.
+
+--- |
+  @D0 = dso_local local_unnamed_addr global i32 0, align 4
+  @D1 = dso_local local_unnamed_addr global i32 0, align 4
+  @U0 = dso_local local_unnamed_addr global i32 0, align 4
+  @U1 = dso_local local_unnamed_addr global i32 0, align 4
+  define void @func() { ret void }
+...
+---
+name: func
+alignment:       16
+tracksRegLiveness: true
+tracksDebugUserValues: true
+frameInfo:
+  maxAlignment:    4
+stack:
+  - { id: 0, type: spill-slot, size: 4, alignment: 4 }
+  - { id: 1, type: spill-slot, size: 4, alignment: 4 }
+machineFunctionInfo: {}
+body:             |
+  ; CHECK-LABEL: name: func
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   liveins: $xmm0, $xmm2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $eax = MOV32rm $rip, 1, $noreg, @D0, $noreg :: (dereferenceable load (s32) from @D0)
+  ; CHECK-NEXT:   $xmm1 = MOVDI2PDIrr $eax
+  ; CHECK-NEXT:   $eax = MOV32rm $rip, 1, $noreg, @D1, $noreg :: (dereferenceable load (s32) from @D1)
+  ; CHECK-NEXT:   $xmm3 = MOVDI2PDIrr $eax
+  ; CHECK-NEXT:   $eax = MOVPDI2DIrr $xmm1
+  ; CHECK-NEXT:   MOV32mr $rip, 1, $noreg, @U0, $noreg, killed renamable $eax :: (store (s32) into @U0)
+  ; CHECK-NEXT:   $eax = MOVPDI2DIrr $xmm3
+  ; CHECK-NEXT:   MOV32mr $rip, 1, $noreg, @U1, $noreg, killed renamable $eax :: (store (s32) into @U1)
+  ; CHECK-NEXT:   JMP_1 %bb.1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   liveins: $xmm0, $xmm2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   RET 0
+  bb.0:
+    successors: %bb.1
+    liveins: $xmm0, $xmm2
+
+    $eax = MOV32rm $rip, 1, $noreg, @D0, $noreg :: (dereferenceable load (s32) from @D0)
+    MOV32mr %stack.0, 1, $noreg, 0, $noreg, killed renamable $eax :: (store (s32) into %stack.0)
+    $eax = MOV32rm $rip, 1, $noreg, @D1, $noreg :: (dereferenceable load (s32) from @D1)
+    MOV32mr %stack.1, 1, $noreg, 0, $noreg, killed renamable $eax :: (store (s32) into %stack.1)
+
+    $eax = MOV32rm %stack.0, 1, $noreg, 0, $noreg :: (load (s32) from %stack.0)
+    MOV32mr $rip, 1, $noreg, @U0, $noreg, killed renamable $eax :: (store (s32) into @U0)
+    $eax = MOV32rm %stack.1, 1, $noreg, 0, $noreg :: (load (s32) from %stack.1)
+    MOV32mr $rip, 1, $noreg, @U1, $noreg, killed renamable $eax :: (store (s32) into @U1)
+    JMP_1 %bb.1
+
+  bb.1:
+    liveins: $xmm0, $xmm2
+    RET 0
+
+...
diff --git a/llvm/test/CodeGen/X86/spill2reg_liveregs_reload_mbb_and_intermediate.mir b/llvm/test/CodeGen/X86/spill2reg_liveregs_reload_mbb_and_intermediate.mir
new file mode 100644
index 00000000000000..18d07a53a5d752
--- /dev/null
+++ b/llvm/test/CodeGen/X86/spill2reg_liveregs_reload_mbb_and_intermediate.mir
@@ -0,0 +1,75 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc %s -o - -mtriple=x86_64-unknown-linux -enable-spill2reg -mattr=+sse4.1 --run-pass=spill2reg -simplify-mir -spill2reg-mem-instrs=0 -spill2reg-vec-instrs=99999 | FileCheck %s
+
+# Check that the traversal works correctly when an MBB (like BB1) contains
+# a reload (reload1) but is also an intermediate block on the path from
+# another reload (reload2) back to the spill.
+# In the following example we need to make sure that we don't skip the
+# instructions of BB1 below reload1 during the bottom-up traversal from
+# reload2 to the spill.
+
+# BB0:
+#   [stack.0] = ... ; spill
+# BB1:
+#   ... = [stack.0] ; reload1
+#   call            ; clobbers xmm regs
+# BB2:
+#   ... = [stack.0] ; reload2
+
+--- |
+  @D0 = dso_local local_unnamed_addr global i32 0, align 4
+  @U0 = dso_local local_unnamed_addr global i32 0, align 4
+  declare void @foo()
+  define void @func() { ret void }
+...
+---
+name: func
+alignment:       16
+tracksRegLiveness: true
+tracksDebugUserValues: true
+frameInfo:
+  maxAlignment:    4
+stack:
+  - { id: 0, type: spill-slot, size: 4, alignment: 4 }
+machineFunctionInfo: {}
+body:             |
+  ; CHECK-LABEL: name: func
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   $eax = MOV32rm $rip, 1, $noreg, @D0, $noreg :: (dereferenceable load (s32) from @D0)
+  ; CHECK-NEXT:   MOV32mr %stack.0, 1, $noreg, 0, $noreg, killed renamable $eax :: (store (s32) into %stack.0)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   $eax = MOV32rm %stack.0, 1, $noreg, 0, $noreg :: (load (s32) from %stack.0)
+  ; CHECK-NEXT:   MOV32mr $rip, 1, $noreg, @U0, $noreg, killed renamable $eax :: (store (s32) into @U0)
+  ; CHECK-NEXT:   CALL64pcrel32 @foo, csr_64, implicit $rsp, implicit-def $rsp
+  ; CHECK-NEXT:   JMP_1 %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   $eax = MOV32rm %stack.0, 1, $noreg, 0, $noreg :: (load (s32) from %stack.0)
+  ; CHECK-NEXT:   MOV32mr $rip, 1, $noreg, @U0, $noreg, killed renamable $eax :: (store (s32) into @U0)
+  ; CHECK-NEXT:   RET 0
+
+
+
+  bb.0:
+    successors: %bb.1
+    $eax = MOV32rm $rip, 1, $noreg, @D0, $noreg :: (dereferenceable load (s32) from @D0)
+    ; spill
+    MOV32mr %stack.0, 1, $noreg, 0, $noreg, killed renamable $eax :: (store (s32) into %stack.0)
+
+  bb.1:
+    successors: %bb.2
+    ; reload1
+    $eax = MOV32rm %stack.0, 1, $noreg, 0, $noreg :: (load (s32) from %stack.0)
+    MOV32mr $rip, 1, $noreg, @U0, $noreg, killed renamable $eax :: (store (s32) into @U0)
+
+    ; The call clobbers all xmm regs
+    CALL64pcrel32 @foo, csr_64, implicit $rsp, implicit-def $rsp
+    JMP_1 %bb.2
+
+  bb.2:
+    ; reload2
+    $eax = MOV32rm %stack.0, 1, $noreg, 0, $noreg :: (load (s32) from %stack.0)
+    MOV32mr $rip, 1, $noreg, @U0, $noreg, killed renamable $eax :: (store (s32) into @U0)
+    RET 0
+...
diff --git a/llvm/test/CodeGen/X86/spill2reg_simple_2.mir b/llvm/test/CodeGen/X86/spill2reg_simple_2.mir
index a8cfa501436d79..419971af7d074d 100644
--- a/llvm/test/CodeGen/X86/spill2reg_simple_2.mir
+++ b/llvm/test/CodeGen/X86/spill2reg_simple_2.mir
@@ -32,10 +32,10 @@ body:             |
     ; CHECK: $eax = MOV32rm $rip, 1, $noreg, @D0, $noreg :: (dereferenceable load (s32) from @D0)
     ; CHECK-NEXT: $xmm0 = MOVDI2PDIrr $eax
     ; CHECK-NEXT: $eax = MOV32rm $rip, 1, $noreg, @D1, $noreg :: (dereferenceable load (s32) from @D1)
-    ; CHECK-NEXT: $xmm0 = MOVDI2PDIrr $eax
+    ; CHECK-NEXT: $xmm1 = MOVDI2PDIrr $eax
     ; CHECK-NEXT: $eax = MOVPDI2DIrr $xmm0
     ; CHECK-NEXT: MOV32mr $rip, 1, $noreg, @U0, $noreg, killed renamable $eax :: (store (s32) into @U0)
-    ; CHECK-NEXT: $eax = MOVPDI2DIrr $xmm0
+    ; CHECK-NEXT: $eax = MOVPDI2DIrr $xmm1
     ; CHECK-NEXT: MOV32mr $rip, 1, $noreg, @U1, $noreg, killed renamable $eax :: (store (s32) into @U1)
     ; CHECK-NEXT: RET 0
     $eax = MOV32rm $rip, 1, $noreg, @D0, $noreg :: (dereferenceable load (s32) from @D0)
diff --git a/llvm/test/CodeGen/X86/spill2reg_simple_3.mir b/llvm/test/CodeGen/X86/spill2reg_simple_3.mir
index 5a87fb562bb2c0..3705503b9c8ff1 100644
--- a/llvm/test/CodeGen/X86/spill2reg_simple_3.mir
+++ b/llvm/test/CodeGen/X86/spill2reg_simple_3.mir
@@ -67,66 +67,66 @@ body:             |
     ; MEM0: $eax = MOV32rm $rip, 1, $noreg, @D0, $noreg :: (dereferenceable load (s32) from @D0)
     ; MEM0-NEXT: $xmm0 = MOVDI2PDIrr $eax
     ; MEM0-NEXT: $eax = MOV32rm $rip, 1, $noreg, @D1, $noreg :: (dereferenceable load (s32) from @D1)
-    ; MEM0-NEXT: $xmm0 = MOVDI2PDIrr $eax
+    ; MEM0-NEXT: $xmm5 = MOVDI2PDIrr $eax
     ; MEM0-NEXT: $eax = MOV32rm $rip, 1, $noreg, @D2, $noreg :: (dereferenceable load (s32) from @D2)
-    ; MEM0-NEXT: $xmm0 = MOVDI2PDIrr $eax
+    ; MEM0-NEXT: $xmm2 = MOVDI2PDIrr $eax
     ; MEM0-NEXT: $eax = MOV32rm $rip, 1, $noreg, @D3, $noreg :: (dereferenceable load (s32) from @D3)
-    ; MEM0-NEXT: $xmm0 = MOVDI2PDIrr $eax
+    ; MEM0-NEXT: $xmm6 = MOVDI2PDIrr $eax
     ; MEM0-NEXT: $eax = MOV32rm $rip, 1, $noreg, @D4, $noreg :: (dereferenceable load (s32) from @D4)
-    ; MEM0-NEXT: $xmm0 = MOVDI2PDIrr $eax
+    ; MEM0-NEXT: $xmm3 = MOVDI2PDIrr $eax
     ; MEM0-NEXT: $eax = MOV32rm $rip, 1, $noreg, @D5, $noreg :: (dereferenceable load (s32) from @D5)
-    ; MEM0-NEXT: $xmm0 = MOVDI2PDIrr $eax
+    ; MEM0-NEXT: $xmm7 = MOVDI2PDIrr $eax
     ; MEM0-NEXT: $eax = MOV32rm $rip, 1, $noreg, @D6, $noreg :: (dereferenceable load (s32) from @D6)
-    ; MEM0-NEXT: $xmm0 = MOVDI2PDIrr $eax
+    ; MEM0-NEXT: $xmm4 = MOVDI2PDIrr $eax
     ; MEM0-NEXT: $eax = MOV32rm $rip, 1, $noreg, @D7, $noreg :: (dereferenceable load (s32) from @D7)
-    ; MEM0-NEXT: $xmm0 = MOVDI2PDIrr $eax
+    ; MEM0-NEXT: $xmm1 = MOVDI2PDIrr $eax
     ; MEM0-NEXT: $eax = MOVPDI2DIrr $xmm0
     ; MEM0-NEXT: MOV32mr $rip, 1, $noreg, @U0, $noreg, killed renamable $eax :: (store (s32) into @U0)
-    ; MEM0-NEXT: $eax = MOVPDI2DIrr $xmm0
+    ; MEM0-NEXT: $eax = MOVPDI2DIrr $xmm5
     ; MEM0-NEXT: MOV32mr $rip, 1, $noreg, @U1, $noreg, killed renamable $eax :: (store (s32) into @U1)
-    ; MEM0-NEXT: $eax = MOVPDI2DIrr $xmm0
+    ; MEM0-NEXT: $eax = MOVPDI2DIrr $xmm2
     ; MEM0-NEXT: MOV32mr $rip, 1, $noreg, @U2, $noreg, killed renamable $eax :: (store (s32) into @U2)
-    ; MEM0-NEXT: $eax = MOVPDI2DIrr $xmm0
+    ; MEM0-NEXT: $eax = MOVPDI2DIrr $xmm6
     ; MEM0-NEXT: MOV32mr $rip, 1, $noreg, @U3, $noreg, killed renamable $eax :: (store (s32) into @U3)
-    ; MEM0-NEXT: $eax = MOVPDI2DIrr $xmm0
+    ; MEM0-NEXT: $eax = MOVPDI2DIrr $xmm3
     ; MEM0-NEXT: MOV32mr $rip, 1, $noreg, @U4, $noreg, killed renamable $eax :: (store (s32) into @U4)
-    ; MEM0-NEXT: $eax = MOVPDI2DIrr $xmm0
+    ; MEM0-NEXT: $eax = MOVPDI2DIrr $xmm7
     ; MEM0-NEXT: MOV32mr $rip, 1, $noreg, @U5, $noreg, killed renamable $eax :: (store (s32) into @U5)
-    ; MEM0-NEXT: $eax = MOVPDI2DIrr $xmm0
+    ; MEM0-NEXT: $eax = MOVPDI2DIrr $xmm4
     ; MEM0-NEXT: MOV32mr $rip, 1, $noreg, @U6, $noreg, killed renamable $eax :: (store (s32) into @U6)
-    ; MEM0-NEXT: $eax = MOVPDI2DIrr $xmm0
+    ; MEM0-NEXT: $eax = MOVPDI2DIrr $xmm1
     ; MEM0-NEXT: MOV32mr $rip, 1, $noreg, @U7, $noreg, killed renamable $eax :: (store (s32) into @U7)
     ; MEM0-NEXT: RET 0
     ; MEM50-LABEL: name: func
     ; MEM50: $eax = MOV32rm $rip, 1, $noreg, @D0, $noreg :: (dereferenceable load (s32) from @D0)
     ; MEM50-NEXT: MOV32mr %stack.0, 1, $noreg, 0, $noreg, killed renamable $eax :: (store (s32) into %stack.0)
     ; MEM50-NEXT: $eax = MOV32rm $rip, 1, $noreg, @D1, $noreg :: (dereferenceable load (s32) from @D1)
-    ; MEM50-NEXT: $xmm0 = MOVDI2PDIrr $eax
+    ; MEM50-NEXT: $xmm3 = MOVDI2PDIrr $eax
     ; MEM50-NEXT: $eax = MOV32rm $rip, 1, $noreg, @D2, $noreg :: (dereferenceable load (s32) from @D2)
     ; MEM50-NEXT: $xmm0 = MOVDI2PDIrr $eax
     ; MEM50-NEXT: $eax = MOV32rm $rip, 1, $noreg, @D3, $noreg :: (dereferenceable load (s32) from @D3)
-    ; MEM50-NEXT: $xmm0 = MOVDI2PDIrr $eax
+    ; MEM50-NEXT: $xmm4 = MOVDI2PDIrr $eax
     ; MEM50-NEXT: $eax = MOV32rm $rip, 1, $noreg, @D4, $noreg :: (dereferenceable load (s32) from @D4)
-    ; MEM50-NEXT: $xmm0 = MOVDI2PDIrr $eax
+    ; MEM50-NEXT: $xmm1 = MOVDI2PDIrr $eax
     ; MEM50-NEXT: $eax = MOV32rm $rip, 1, $noreg, @D5, $noreg :: (dereferenceable load (s32) from @D5)
-    ; MEM50-NEXT: $xmm0 = MOVDI2PDIrr $eax
+    ; MEM50-NEXT: $xmm5 = MOVDI2PDIrr $eax
     ; MEM50-NEXT: $eax = MOV32rm $rip, 1, $noreg, @D6, $noreg :: (dereferenceable load (s32) from @D6)
-    ; MEM50-NEXT: $xmm0 = MOVDI2PDIrr $eax
+    ; MEM50-NEXT: $xmm2 = MOVDI2PDIrr $eax
     ; MEM50-NEXT: $eax = MOV32rm $rip, 1, $noreg, @D7, $noreg :: (dereferenceable load (s32) from @D7)
     ; MEM50-NEXT: MOV32mr %stack.7, 1, $noreg, 0, $noreg, killed renamable $eax :: (store (s32) into %stack.7)
     ; MEM50-NEXT: $eax = MOV32rm %stack.0, 1, $noreg, 0, $noreg :: (load (s32) from %stack.0)
     ; MEM50-NEXT: MOV32mr $rip, 1, $noreg, @U0, $noreg, killed renamable $eax :: (store (s32) into @U0)
-    ; MEM50-NEXT: $eax = MOVPDI2DIrr $xmm0
+    ; MEM50-NEXT: $eax = MOVPDI2DIrr $xmm3
     ; MEM50-NEXT: MOV32mr $rip, 1, $noreg, @U1, $noreg, killed renamable $eax :: (store (s32) into @U1)
     ; MEM50-NEXT: $eax = MOVPDI2DIrr $xmm0
     ; MEM50-NEXT: MOV32mr $rip, 1, $noreg, @U2, $noreg, killed renamable $eax :: (store (s32) into @U2)
-    ; MEM50-NEXT: $eax = MOVPDI2DIrr $xmm0
+    ; MEM50-NEXT: $eax = MOVPDI2DIrr $xmm4
     ; MEM50-NEXT: MOV32mr $rip, 1, $noreg, @U3, $noreg, killed renamable $eax :: (store (s32) into @U3)
-    ; MEM50-NEXT: $eax = MOVPDI2DIrr $xmm0
+    ; MEM50-NEXT: $eax = MOVPDI2DIrr $xmm1
     ; MEM50-NEXT: MOV32mr $rip, 1, $noreg, @U4, $noreg, killed renamable $eax :: (store (s32) into @U4)
-    ; MEM50-NEXT: $eax = MOVPDI2DIrr $xmm0
+    ; MEM50-NEXT: $eax = MOVPDI2DIrr $xmm5
     ; MEM50-NEXT: MOV32mr $rip, 1, $noreg, @U5, $noreg, killed renamable $eax :: (store (s32) into @U5)
-    ; MEM50-NEXT: $eax = MOVPDI2DIrr $xmm0
+    ; MEM50-NEXT: $eax = MOVPDI2DIrr $xmm2
     ; MEM50-NEXT: MOV32mr $rip, 1, $noreg, @U6, $noreg, killed renamable $eax :: (store (s32) into @U6)
     ; MEM50-NEXT: $eax = MOV32rm %stack.7, 1, $noreg, 0, $noreg :: (load (s32) from %stack.7)
     ; MEM50-NEXT: MOV32mr $rip, 1, $noreg, @U7, $noreg, killed renamable $eax :: (store (s32) into @U7)
@@ -141,7 +141,7 @@ body:             |
     ; MEM100-NEXT: $eax = MOV32rm $rip, 1, $noreg, @D3, $noreg :: (dereferenceable load (s32) from @D3)
     ; MEM100-NEXT: MOV32mr %stack.3, 1, $noreg, 0, $noreg, killed renamable $eax :: (store (s32) into %stack.3)
     ; MEM100-NEXT: $eax = MOV32rm $rip, 1, $noreg, @D4, $noreg :: (dereferenceable load (s32) from @D4)
-    ; MEM100-NEXT: $xmm0 = MOVDI2PDIrr $eax
+    ; MEM100-NEXT: $xmm1 = MOVDI2PDIrr $eax
     ; MEM100-NEXT: $eax = MOV32rm $rip, 1, $noreg, @D5, $noreg :: (dereferenceable load (s32) from @D5)
     ; MEM100-NEXT: MOV32mr %stack.5, 1, $noreg, 0, $noreg, killed renamable $eax :: (store (s32) into %stack.5)
     ; MEM100-NEXT: $eax = MOV32rm $rip, 1, $noreg, @D6, $noreg :: (dereferenceable load (s32) from @D6)
@@ -156,7 +156,7 @@ body:             |
     ; MEM100-NEXT: MOV32mr $rip, 1, $noreg, @U2, $noreg, killed renamable $eax :: (store (s32) into @U2)
     ; MEM100-NEXT: $eax = MOV32rm %stack.3, 1, $noreg, 0, $noreg :: (load (s32) from %stack.3)
     ; MEM100-NEXT: MOV32mr $rip, 1, $noreg, @U3, $noreg, killed renamable $eax :: (store (s32) into @U3)
-    ; MEM100-NEXT: $eax = MOVPDI2DIrr $xmm0
+    ; MEM100-NEXT: $eax = MOVPDI2DIrr $xmm1
     ; MEM100-NEXT: MOV32mr $rip, 1, $noreg, @U4, $noreg, killed renamable $eax :: (store (s32) into @U4)
     ; MEM100-NEXT: $eax = MOV32rm %stack.5, 1, $noreg, 0, $noreg :: (load (s32) from %stack.5)
     ; MEM100-NEXT: MOV32mr $rip, 1, $noreg, @U5, $noreg, killed renamable $eax :: (store (s32) into @U5)

>From f674ebb2c1638c23b025080a86520c10a79c4a91 Mon Sep 17 00:00:00 2001
From: Vasileios Porpodas <vporpodas at google.com>
Date: Fri, 20 Dec 2024 02:29:00 -0800
Subject: [PATCH 7/9] [Spill2Reg] Update MBB live-ins

This patch updates the MBB live-ins to account for the new instructions
emitted by spill2reg. This is required for correct tracking of live
register usage.

Original review: https://reviews.llvm.org/D118304
---
 llvm/lib/CodeGen/Spill2Reg.cpp                |  98 ++++++++++++++-
 .../spill2reg_liveins_reload_before_spill.mir |  74 ++++++++++++
 .../X86/spill2reg_liveins_spill_override.mir  | 103 ++++++++++++++++
 .../CodeGen/X86/spill2reg_liveregs_bbs.mir    |  73 +++++++++++
 .../CodeGen/X86/spill2reg_liveregs_cross.mir  | 114 ++++++++++++++++++
 .../spill2reg_liveregs_vec_under_reload.mir   |  90 ++++++++++++++
 6 files changed, 550 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/CodeGen/X86/spill2reg_liveins_reload_before_spill.mir
 create mode 100644 llvm/test/CodeGen/X86/spill2reg_liveins_spill_override.mir
 create mode 100644 llvm/test/CodeGen/X86/spill2reg_liveregs_bbs.mir
 create mode 100644 llvm/test/CodeGen/X86/spill2reg_liveregs_cross.mir
 create mode 100644 llvm/test/CodeGen/X86/spill2reg_liveregs_vec_under_reload.mir
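
The gist of the live-in update, condensed into a standalone sketch before the
diff: walk backwards from each reload, adding the chosen vector register to
the live-ins of every block on the path, and stop each path at a block that
contains a spill of the slot (the register is defined there). The real code
below also consults a per-reload IsLiveIn flag collected earlier; the helper
name here is this sketch's own.

  #include "llvm/ADT/DenseSet.h"
  #include "llvm/CodeGen/MachineBasicBlock.h"
  using namespace llvm;

  static void markLiveInUpToSpill(MachineBasicBlock *MBB, MCRegister VectorReg,
                                  const DenseSet<MachineBasicBlock *> &SpillMBBs,
                                  DenseSet<MachineBasicBlock *> &Visited) {
    if (!Visited.insert(MBB).second)
      return; // Already handled on another path.
    if (SpillMBBs.count(MBB))
      return; // The spill defines VectorReg here; stop this path.
    if (!MBB->isLiveIn(VectorReg))
      MBB->addLiveIn(VectorReg);
    for (MachineBasicBlock *Pred : MBB->predecessors())
      markLiveInUpToSpill(Pred, VectorReg, SpillMBBs, Visited);
  }

This is invoked on each predecessor of a reload's block; the reload block
itself gets the live-in directly, unless the value comes from a spill earlier
in that same block.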

diff --git a/llvm/lib/CodeGen/Spill2Reg.cpp b/llvm/lib/CodeGen/Spill2Reg.cpp
index 29d9f9509dd7f7..ee5aec529da451 100644
--- a/llvm/lib/CodeGen/Spill2Reg.cpp
+++ b/llvm/lib/CodeGen/Spill2Reg.cpp
@@ -66,11 +66,43 @@ class Spill2Reg : public MachineFunctionPass {
       /// differs across accesses to the same stack slot.
       unsigned SpillBits = 0;
 #ifndef NDEBUG
-      LLVM_DUMP_METHOD void dump() const;
+      LLVM_DUMP_METHOD virtual void dump() const;
+      virtual ~MIData() {}
+#endif
+    };
+
+    struct MIDataWithLiveIn : public MIData {
+      MIDataWithLiveIn(MachineInstr *MI, const MachineOperand *MO,
+                       unsigned SpillBits)
+          : MIData(MI, MO, SpillBits) {}
+      /// We set this to false to mark the vector register associated with this
+      /// reload as definitely not live-in. This is useful in blocks with both
+      /// spill and reload of the same stack slot, like in the example:
+      /// \verbatim
+      ///  bb:
+      ///    spill %stack.0
+      ///    reload %stack.0
+      /// \endverbatim
+      /// This information is used during `updateLiveIns()`. We are collecting
+      /// this information during `collectSpillsAndReloads()` because we are
+      /// already walking through the code there. Otherwise we would need to
+      /// walk through the code again in `updateLiveIns()` just to check for
+      /// other spills in the block, which would waste compilation time.
+      bool IsLiveIn = true;
+#ifndef NDEBUG
+      LLVM_DUMP_METHOD void dump() const override;
 #endif
     };
     SmallVector<MIData, 1> Spills;
-    SmallVector<MIData, 1> Reloads;
+    SmallVector<MIDataWithLiveIn, 1> Reloads;
+
+    /// \Returns the register class of the register being spilled.
+    const TargetRegisterClass *
+    getSpilledRegClass(const TargetInstrInfo *TII,
+                       const TargetRegisterInfo *TRI) const {
+      auto Reg0 = Spills.front().MO->getReg();
+      return TII->getVectorRegisterClassForSpill2Reg(TRI, Reg0);
+    }
 
     /// \Returns the register class of the register being spilled.
     const TargetRegisterClass *
@@ -195,6 +227,15 @@ void Spill2Reg::collectSpillsAndReloads() {
         }
         unsigned SpillBits = TRI->getRegSizeInBits(MO->getReg(), *MRI);
         Entry.Spills.emplace_back(Spill, MO, SpillBits);
+
+        // If any of the reloads collected so far is in the same MBB, mark it
+        // as not live-in. This is used in `updateLiveIns()`, where we update
+        // the live-ins of MBBs to include the new vector register. Doing this
+        // now avoids an MBB walk in `updateLiveIns()`, which should save
+        // compilation time.
+        for (auto &MID : Entry.Reloads)
+          if (MID.MI->getParent() == &MBB)
+            MID.IsLiveIn = false;
       } else if (const MachineOperand *MO =
                      TII->isLoadFromStackSlotMO(MI, StackSlot)) {
         MachineInstr *Reload = &MI;
@@ -265,6 +306,49 @@ static void DFS(MachineBasicBlock *MBB, DenseSet<MachineBasicBlock *> &Visited,
   for (MachineBasicBlock *PredMBB : MBB->predecessors())
     DFS(PredMBB, Visited, Fn);
 }
+
+void Spill2Reg::updateLiveIns(StackSlotDataEntry &Entry, MCRegister VectorReg) {
+  // Collect the parent MBBs of Spills for fast lookup.
+  DenseSet<MachineBasicBlock *> SpillMBBs(Entry.Spills.size());
+  DenseSet<MachineInstr *> Spills(Entry.Spills.size());
+  for (const auto &Data : Entry.Spills) {
+    SpillMBBs.insert(Data.MI->getParent());
+    Spills.insert(Data.MI);
+  }
+
+  auto AddLiveInIfRequired = [VectorReg, &SpillMBBs](MachineBasicBlock *MBB) {
+    // If there is a spill in this MBB then we don't need to add a live-in.
+    // This works even if there is a reload above the spill, like this:
+    //   reload stack.0
+    //   spill  stack.0
+    // because the live-in due to the reload is handled in a separate walk.
+    if (SpillMBBs.count(MBB))
+      // Return true to stop the recursion.
+      return true;
+    // If there are no spills in this block then the register is live-in.
+    if (!MBB->isLiveIn(VectorReg))
+      MBB->addLiveIn(VectorReg);
+    // Return false to continue the recursion.
+    return false;
+  };
+
+  // Update the MBB live-ins. These are used for the live regs calculation.
+  DenseSet<MachineBasicBlock *> Visited;
+  for (const auto &ReloadData : Entry.Reloads) {
+    MachineInstr *Reload = ReloadData.MI;
+    MachineBasicBlock *MBB = Reload->getParent();
+    // From a previous walk in MBB we know whether the reload is live-in, or
+    // whether the value comes from an earlier spill in the same MBB.
+    if (!ReloadData.IsLiveIn)
+      continue;
+    if (!MBB->isLiveIn(VectorReg))
+      MBB->addLiveIn(VectorReg);
+
+    for (MachineBasicBlock *PredMBB : Reload->getParent()->predecessors())
+      DFS(PredMBB, Visited, AddLiveInIfRequired);
+  }
+}
+
 // Replace stack-based spills/reloads with register-based ones.
 void Spill2Reg::replaceStackWithReg(StackSlotDataEntry &Entry,
                                     Register VectorReg) {
@@ -408,6 +492,12 @@ void Spill2Reg::generateCode() {
     if (!PhysVectorRegOpt)
       continue;
 
+    // Update the MBB live-ins; these are used for the live regs calculation.
+    // (The parent MBBs of the spills are collected inside for fast lookup.)
+    // NOTE: We do this before calling replaceStackWithReg() because it will
+    // remove the spill/reload instructions from Entry.
+    updateLiveIns(Entry, *PhysVectorRegOpt);
+
     // Replace stack accesses with register accesses.
     replaceStackWithReg(Entry, *PhysVectorRegOpt);
 
@@ -437,6 +527,10 @@ void Spill2Reg::StackSlotDataEntry::MIData::dump() const {
   dbgs() << "  (" << *MO << ") " << *MI;
 }
 
+void Spill2Reg::StackSlotDataEntry::MIDataWithLiveIn::dump() const {
+  dbgs() << "  (" << *MO << ") " << *MI << " IsLiveIn: " << IsLiveIn;
+}
+
 void Spill2Reg::StackSlotDataEntry::dump() const {
   dbgs().indent(DumpInd) << "Disable: " << Disable << "\n";
   dbgs().indent(DumpInd) << "Spills:\n";
diff --git a/llvm/test/CodeGen/X86/spill2reg_liveins_reload_before_spill.mir b/llvm/test/CodeGen/X86/spill2reg_liveins_reload_before_spill.mir
new file mode 100644
index 00000000000000..6101ffb94a0938
--- /dev/null
+++ b/llvm/test/CodeGen/X86/spill2reg_liveins_reload_before_spill.mir
@@ -0,0 +1,74 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc %s -o - -mtriple=x86_64-unknown-linux -enable-spill2reg -mattr=+sse4.1 --run-pass=spill2reg -simplify-mir -spill2reg-mem-instrs=0 -spill2reg-vec-instrs=99999 | FileCheck %s
+
+# Check that the live-ins of an MBB get updated correctly if it contains a
+# reload before a spill.
+
+# bb.0:
+#  spill stack.0
+#  JMP bb.1
+#
+# bb.1:
+#  reload stack.0
+#  spill stack.0
+#  JMP bb.2
+#
+# bb.2:
+#  reload stack.0
+
+
+--- |
+  @D0 = dso_local local_unnamed_addr global i32 0, align 4
+  @U0 = dso_local local_unnamed_addr global i32 0, align 4
+  define void @func() { ret void }
+...
+---
+name: func
+alignment:       16
+tracksRegLiveness: true
+tracksDebugUserValues: true
+frameInfo:
+  maxAlignment:    4
+stack:
+  - { id: 0, type: spill-slot, size: 4, alignment: 4 }
+machineFunctionInfo: {}
+body:             |
+  ; CHECK-LABEL: name: func
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   $eax = MOV32rm $rip, 1, $noreg, @D0, $noreg :: (dereferenceable load (s32) from @D0)
+  ; CHECK-NEXT:   $xmm0 = MOVDI2PDIrr $eax
+  ; CHECK-NEXT:   JMP_1 %bb.1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   liveins: $xmm0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $eax = MOVPDI2DIrr $xmm0
+  ; CHECK-NEXT:   $xmm0 = MOVDI2PDIrr $eax
+  ; CHECK-NEXT:   JMP_1 %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   liveins: $xmm0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $eax = MOVPDI2DIrr $xmm0
+  ; CHECK-NEXT:   RET 0
+
+
+
+  bb.0:
+    $eax = MOV32rm $rip, 1, $noreg, @D0, $noreg :: (dereferenceable load (s32) from @D0)
+    ; Spill stack.0
+    MOV32mr %stack.0, 1, $noreg, 0, $noreg, killed renamable $eax :: (store (s32) into %stack.0)
+    JMP_1 %bb.1
+
+  bb.1:
+    ; Reload stack.0
+    $eax = MOV32rm %stack.0, 1, $noreg, 0, $noreg :: (load (s32) from %stack.0)
+    ; Spill stack.0
+    MOV32mr %stack.0, 1, $noreg, 0, $noreg, killed renamable $eax :: (store (s32) into %stack.0)
+    JMP_1 %bb.2
+
+  bb.2:
+    ; Reload
+    $eax = MOV32rm %stack.0, 1, $noreg, 0, $noreg :: (load (s32) from %stack.0)
+    RET 0
+...
diff --git a/llvm/test/CodeGen/X86/spill2reg_liveins_spill_override.mir b/llvm/test/CodeGen/X86/spill2reg_liveins_spill_override.mir
new file mode 100644
index 00000000000000..6807c7f0eb07bc
--- /dev/null
+++ b/llvm/test/CodeGen/X86/spill2reg_liveins_spill_override.mir
@@ -0,0 +1,103 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc %s -o - -mtriple=x86_64-unknown-linux -enable-spill2reg -mattr=+sse4.1 --run-pass=spill2reg -simplify-mir -spill2reg-mem-instrs=0 -spill2reg-vec-instrs=99999 | FileCheck %s
+
+
+# Checks that spills that are overridden by other spills (like spill0 by
+# spill1) are not marked as live-in.
+# Also checks live reg tracking with spill-reload pairs separated by an
+# instruction that clobbers the xmm registers.
+
+# BB0:
+#   [stack.0] = ... ; spill0
+# BB1:
+#   [stack.0] = ... ; spill1
+#   ... = [stack.0] ; reload1
+#   call            ; clobbers xmm regs
+#   [stack.0] = ... ; spill2
+#   ... = [stack.0] ; reload2
+# BB2:
+#   ... = [stack.0] ; reload3
+
+--- |
+  @D0 = dso_local local_unnamed_addr global i32 0, align 4
+  @U0 = dso_local local_unnamed_addr global i32 0, align 4
+  declare void @foo()
+  define void @func() { ret void }
+...
+---
+name: func
+alignment:       16
+tracksRegLiveness: true
+tracksDebugUserValues: true
+frameInfo:
+  maxAlignment:    4
+stack:
+  - { id: 0, type: spill-slot, size: 4, alignment: 4 }
+machineFunctionInfo: {}
+body:             |
+  ; CHECK-LABEL: name: func
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   $eax = MOV32rm $rip, 1, $noreg, @D0, $noreg :: (dereferenceable load (s32) from @D0)
+  ; CHECK-NEXT:   $xmm0 = MOVDI2PDIrr $eax
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   $eax = MOV32rm $rip, 1, $noreg, @D0, $noreg :: (dereferenceable load (s32) from @D0)
+  ; CHECK-NEXT:   $xmm0 = MOVDI2PDIrr $eax
+  ; CHECK-NEXT:   $eax = MOVPDI2DIrr $xmm0
+  ; CHECK-NEXT:   MOV32mr $rip, 1, $noreg, @U0, $noreg, killed renamable $eax :: (store (s32) into @U0)
+  ; CHECK-NEXT:   CALL64pcrel32 @foo, csr_64, implicit $rsp, implicit-def $rsp
+  ; CHECK-NEXT:   $eax = MOV32rm $rip, 1, $noreg, @D0, $noreg :: (dereferenceable load (s32) from @D0)
+  ; CHECK-NEXT:   $xmm0 = MOVDI2PDIrr $eax
+  ; CHECK-NEXT:   $eax = MOVPDI2DIrr $xmm0
+  ; CHECK-NEXT:   MOV32mr $rip, 1, $noreg, @U0, $noreg, killed renamable $eax :: (store (s32) into @U0)
+  ; CHECK-NEXT:   JMP_1 %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   liveins: $xmm0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $eax = MOVPDI2DIrr $xmm0
+  ; CHECK-NEXT:   MOV32mr $rip, 1, $noreg, @U0, $noreg, killed renamable $eax :: (store (s32) into @U0)
+  ; CHECK-NEXT:   RET 0
+
+
+
+  bb.0:
+    successors: %bb.1
+    $eax = MOV32rm $rip, 1, $noreg, @D0, $noreg :: (dereferenceable load (s32) from @D0)
+    ; spill0
+    MOV32mr %stack.0, 1, $noreg, 0, $noreg, killed renamable $eax :: (store (s32) into %stack.0)
+
+  bb.1:
+    successors: %bb.2
+    ; spill1
+    $eax = MOV32rm $rip, 1, $noreg, @D0, $noreg :: (dereferenceable load (s32) from @D0)
+    MOV32mr %stack.0, 1, $noreg, 0, $noreg, killed renamable $eax :: (store (s32) into %stack.0)
+
+    ; reload1
+    $eax = MOV32rm %stack.0, 1, $noreg, 0, $noreg :: (load (s32) from %stack.0)
+    MOV32mr $rip, 1, $noreg, @U0, $noreg, killed renamable $eax :: (store (s32) into @U0)
+
+    ; The call clobbers all xmm regs
+    CALL64pcrel32 @foo, csr_64, implicit $rsp, implicit-def $rsp
+
+    ; spill2
+    $eax = MOV32rm $rip, 1, $noreg, @D0, $noreg :: (dereferenceable load (s32) from @D0)
+    MOV32mr %stack.0, 1, $noreg, 0, $noreg, killed renamable $eax :: (store (s32) into %stack.0)
+
+    ; reload2
+    $eax = MOV32rm %stack.0, 1, $noreg, 0, $noreg :: (load (s32) from %stack.0)
+    MOV32mr $rip, 1, $noreg, @U0, $noreg, killed renamable $eax :: (store (s32) into @U0)
+    JMP_1 %bb.2
+
+  bb.2:
+    ; reload3
+    $eax = MOV32rm %stack.0, 1, $noreg, 0, $noreg :: (load (s32) from %stack.0)
+    MOV32mr $rip, 1, $noreg, @U0, $noreg, killed renamable $eax :: (store (s32) into @U0)
+    RET 0
+...
diff --git a/llvm/test/CodeGen/X86/spill2reg_liveregs_bbs.mir b/llvm/test/CodeGen/X86/spill2reg_liveregs_bbs.mir
new file mode 100644
index 00000000000000..34616407d26644
--- /dev/null
+++ b/llvm/test/CodeGen/X86/spill2reg_liveregs_bbs.mir
@@ -0,0 +1,73 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc %s -o - -mtriple=x86_64-unknown-linux -enable-spill2reg -mattr=+sse4.1 --run-pass=spill2reg -simplify-mir -spill2reg-mem-instrs=0 -spill2reg-vec-instrs=99999 | FileCheck %s
+
+# Check that live reg tracking works correctly when one live range spans
+# multiple blocks and the other spans a single block.
+# BB0:
+#   [stack.0] = ...
+# BB1:
+#   [stack.1] = ...
+#   ... = [stack.1]
+# BB2:
+#   ... = [stack.0]
+
+--- |
+  @D0 = dso_local local_unnamed_addr global i32 0, align 4
+  @D1 = dso_local local_unnamed_addr global i32 0, align 4
+  @U0 = dso_local local_unnamed_addr global i32 0, align 4
+  @U1 = dso_local local_unnamed_addr global i32 0, align 4
+  define void @func() { ret void }
+...
+---
+name: func
+alignment:       16
+tracksRegLiveness: true
+tracksDebugUserValues: true
+frameInfo:
+  maxAlignment:    4
+stack:
+  - { id: 0, type: spill-slot, size: 4, alignment: 4 }
+  - { id: 1, type: spill-slot, size: 4, alignment: 4 }
+machineFunctionInfo: {}
+body:             |
+  ; CHECK-LABEL: name: func
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   $eax = MOV32rm $rip, 1, $noreg, @D0, $noreg :: (dereferenceable load (s32) from @D0)
+  ; CHECK-NEXT:   $xmm0 = MOVDI2PDIrr $eax
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   liveins: $xmm0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $eax = MOV32rm $rip, 1, $noreg, @D1, $noreg :: (dereferenceable load (s32) from @D1)
+  ; CHECK-NEXT:   $xmm1 = MOVDI2PDIrr $eax
+  ; CHECK-NEXT:   $eax = MOVPDI2DIrr $xmm1
+  ; CHECK-NEXT:   MOV32mr $rip, 1, $noreg, @U1, $noreg, killed renamable $eax :: (store (s32) into @U1)
+  ; CHECK-NEXT:   JMP_1 %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   liveins: $xmm0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $eax = MOVPDI2DIrr $xmm0
+  ; CHECK-NEXT:   MOV32mr $rip, 1, $noreg, @U0, $noreg, killed renamable $eax :: (store (s32) into @U0)
+  ; CHECK-NEXT:   RET 0
+
+
+
+  bb.0:
+    successors: %bb.1
+    $eax = MOV32rm $rip, 1, $noreg, @D0, $noreg :: (dereferenceable load (s32) from @D0)
+    MOV32mr %stack.0, 1, $noreg, 0, $noreg, killed renamable $eax :: (store (s32) into %stack.0)
+
+  bb.1:
+    successors: %bb.2
+    $eax = MOV32rm $rip, 1, $noreg, @D1, $noreg :: (dereferenceable load (s32) from @D1)
+    MOV32mr %stack.1, 1, $noreg, 0, $noreg, killed renamable $eax :: (store (s32) into %stack.1)
+    $eax = MOV32rm %stack.1, 1, $noreg, 0, $noreg :: (load (s32) from %stack.1)
+    MOV32mr $rip, 1, $noreg, @U1, $noreg, killed renamable $eax :: (store (s32) into @U1)
+    JMP_1 %bb.2
+
+  bb.2:
+    $eax = MOV32rm %stack.0, 1, $noreg, 0, $noreg :: (load (s32) from %stack.0)
+    MOV32mr $rip, 1, $noreg, @U0, $noreg, killed renamable $eax :: (store (s32) into @U0)
+    RET 0
+...
diff --git a/llvm/test/CodeGen/X86/spill2reg_liveregs_cross.mir b/llvm/test/CodeGen/X86/spill2reg_liveregs_cross.mir
new file mode 100644
index 00000000000000..437577e8f1f71d
--- /dev/null
+++ b/llvm/test/CodeGen/X86/spill2reg_liveregs_cross.mir
@@ -0,0 +1,114 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc %s -o - -mtriple=x86_64-unknown-linux -enable-spill2reg -mattr=+sse4.1 --run-pass=spill2reg -simplify-mir -spill2reg-mem-instrs=0 -spill2reg-vec-instrs=99999 | FileCheck %s
+
+# Check that reg liveness works correctly when the live ranges cross like this:
+# BB0 BB1
+#  \  /
+#   BB2
+#  /  \
+# BB3 BB4
+#
+# BB0:
+#   [stack.0] = ...
+#   JMP BB2
+# BB1:
+#   [stack.1] = ...
+#   JMP BB2
+# BB2:
+#   ...
+#   JMP BB3 or BB4
+# BB3:
+#   ... = [stack.0]
+# BB4:
+#   ... = [stack.1]
+
+--- |
+  @D0 = dso_local local_unnamed_addr global i32 0, align 4
+  @D1 = dso_local local_unnamed_addr global i32 0, align 4
+  @U0 = dso_local local_unnamed_addr global i32 0, align 4
+  @U1 = dso_local local_unnamed_addr global i32 0, align 4
+  @Cond = dso_local local_unnamed_addr global i32 0, align 4
+  define void @func() { ret void }
+...
+---
+name: func
+alignment:       16
+tracksRegLiveness: true
+tracksDebugUserValues: true
+frameInfo:
+  maxAlignment:    4
+stack:
+  - { id: 0, type: spill-slot, size: 4, alignment: 4 }
+  - { id: 1, type: spill-slot, size: 4, alignment: 4 }
+machineFunctionInfo: {}
+body:             |
+  ; CHECK-LABEL: name: func
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   liveins: $xmm1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $eax = MOV32rm $rip, 1, $noreg, @D0, $noreg :: (dereferenceable load (s32) from @D0)
+  ; CHECK-NEXT:   $xmm0 = MOVDI2PDIrr $eax
+  ; CHECK-NEXT:   JMP_1 %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   liveins: $xmm0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $eax = MOV32rm $rip, 1, $noreg, @D1, $noreg :: (dereferenceable load (s32) from @D1)
+  ; CHECK-NEXT:   $xmm1 = MOVDI2PDIrr $eax
+  ; CHECK-NEXT:   JMP_1 %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   liveins: $xmm0, $xmm1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $eax = MOV32rm $rip, 1, $noreg, @Cond, $noreg :: (dereferenceable load (s32) from @Cond)
+  ; CHECK-NEXT:   TEST32rr $eax, $eax, implicit-def $eflags
+  ; CHECK-NEXT:   JCC_1 %bb.3, 2, implicit $eflags
+  ; CHECK-NEXT:   JMP_1 %bb.4
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   liveins: $xmm0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $eax = MOVPDI2DIrr $xmm0
+  ; CHECK-NEXT:   MOV32mr $rip, 1, $noreg, @U0, $noreg, killed renamable $eax :: (store (s32) into @U0)
+  ; CHECK-NEXT:   RET 0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   liveins: $xmm1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $eax = MOVPDI2DIrr $xmm1
+  ; CHECK-NEXT:   MOV32mr $rip, 1, $noreg, @U1, $noreg, killed renamable $eax :: (store (s32) into @U1)
+  ; CHECK-NEXT:   RET 0
+
+
+
+
+
+  bb.0:
+    successors: %bb.2
+    $eax = MOV32rm $rip, 1, $noreg, @D0, $noreg :: (dereferenceable load (s32) from @D0)
+    MOV32mr %stack.0, 1, $noreg, 0, $noreg, killed renamable $eax :: (store (s32) into %stack.0)
+    JMP_1 %bb.2
+
+  bb.1:
+    successors: %bb.2
+    $eax = MOV32rm $rip, 1, $noreg, @D1, $noreg :: (dereferenceable load (s32) from @D1)
+    MOV32mr %stack.1, 1, $noreg, 0, $noreg, killed renamable $eax :: (store (s32) into %stack.1)
+    JMP_1 %bb.2
+
+  bb.2:
+    $eax = MOV32rm $rip, 1, $noreg, @Cond, $noreg :: (dereferenceable load (s32) from @Cond)
+    TEST32rr $eax, $eax, implicit-def $eflags
+    JCC_1 %bb.3, 2, implicit $eflags
+    JMP_1 %bb.4
+
+  bb.3:
+    $eax = MOV32rm %stack.0, 1, $noreg, 0, $noreg :: (load (s32) from %stack.0)
+    MOV32mr $rip, 1, $noreg, @U0, $noreg, killed renamable $eax :: (store (s32) into @U0)
+    RET 0
+
+  bb.4:
+    $eax = MOV32rm %stack.1, 1, $noreg, 0, $noreg :: (load (s32) from %stack.1)
+    MOV32mr $rip, 1, $noreg, @U1, $noreg, killed renamable $eax :: (store (s32) into @U1)
+    RET 0
+
+...
diff --git a/llvm/test/CodeGen/X86/spill2reg_liveregs_vec_under_reload.mir b/llvm/test/CodeGen/X86/spill2reg_liveregs_vec_under_reload.mir
new file mode 100644
index 00000000000000..bba8f01c24ae13
--- /dev/null
+++ b/llvm/test/CodeGen/X86/spill2reg_liveregs_vec_under_reload.mir
@@ -0,0 +1,90 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc %s -o - -mtriple=x86_64-unknown-linux -enable-spill2reg -mattr=+sse4.1 --run-pass=spill2reg -simplify-mir -spill2reg-mem-instrs=0 -spill2reg-vec-instrs=99999 | FileCheck %s
+
+# Check that reg liveness does not get confused by vector reg accesses under
+# a reload.
+#
+#   BB0
+#  /  \
+# BB1 BB2
+#
+# BB0:
+#   [stack.0] = ...
+#   xmm0 = ...
+#   ... = xmm0
+#   JMP BB1 or BB2
+# BB1:
+#   ... = [stack.0]
+# BB2:
+#   ... = [stack.0]
+#   xmm0 = ...
+
+
+--- |
+  @D0 = dso_local local_unnamed_addr global i32 0, align 4
+  @U0 = dso_local local_unnamed_addr global i32 0, align 4
+  @Cond = dso_local local_unnamed_addr global i32 0, align 4
+  define void @func() { ret void }
+...
+---
+name: func
+tracksRegLiveness: true
+stack:
+  - { id: 0, type: spill-slot, size: 4, alignment: 4 }
+machineFunctionInfo: {}
+body:             |
+  ; CHECK-LABEL: name: func
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   $eax = MOV32rm $rip, 1, $noreg, @D0, $noreg :: (dereferenceable load (s32) from @D0)
+  ; CHECK-NEXT:   $xmm1 = MOVDI2PDIrr $eax
+  ; CHECK-NEXT:   $eax = MOV32rm $rip, 1, $noreg, @Cond, $noreg :: (dereferenceable load (s32) from @Cond)
+  ; CHECK-NEXT:   $xmm0 = MOVDI2PDIrr $eax
+  ; CHECK-NEXT:   $eax = MOVPDI2DIrr $xmm0
+  ; CHECK-NEXT:   TEST32rr $eax, $eax, implicit-def $eflags
+  ; CHECK-NEXT:   JCC_1 %bb.1, 2, implicit $eflags
+  ; CHECK-NEXT:   JMP_1 %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   liveins: $xmm1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $eax = MOVPDI2DIrr $xmm1
+  ; CHECK-NEXT:   MOV32mr $rip, 1, $noreg, @U0, $noreg, killed renamable $eax :: (store (s32) into @U0)
+  ; CHECK-NEXT:   RET 0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   liveins: $xmm1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $eax = MOVPDI2DIrr $xmm1
+  ; CHECK-NEXT:   MOV32mr $rip, 1, $noreg, @U0, $noreg, $eax :: (store (s32) into @U0)
+  ; CHECK-NEXT:   $xmm0 = MOVDI2PDIrr $eax
+  ; CHECK-NEXT:   $eax = MOVPDI2DIrr $xmm0
+  ; CHECK-NEXT:   RET 0
+
+
+
+  bb.0:
+    successors: %bb.1, %bb.2
+    $eax = MOV32rm $rip, 1, $noreg, @D0, $noreg :: (dereferenceable load (s32) from @D0)
+    MOV32mr %stack.0, 1, $noreg, 0, $noreg, killed renamable $eax :: (store (s32) into %stack.0)
+    $eax = MOV32rm $rip, 1, $noreg, @Cond, $noreg :: (dereferenceable load (s32) from @Cond)
+    ; xmm0 used here
+    $xmm0 = MOVDI2PDIrr $eax
+    $eax = MOVPDI2DIrr $xmm0
+
+    TEST32rr $eax, $eax, implicit-def $eflags
+    JCC_1 %bb.1, 2, implicit $eflags
+    JMP_1 %bb.2
+
+  bb.1:
+    $eax = MOV32rm %stack.0, 1, $noreg, 0, $noreg :: (load (s32) from %stack.0)
+    MOV32mr $rip, 1, $noreg, @U0, $noreg, killed renamable $eax :: (store (s32) into @U0)
+    RET 0
+
+  bb.2:
+    $eax = MOV32rm %stack.0, 1, $noreg, 0, $noreg :: (load (s32) from %stack.0)
+    MOV32mr $rip, 1, $noreg, @U0, $noreg, $eax :: (store (s32) into @U0)
+    ; xmm0 used here too, under the reload
+    $xmm0 = MOVDI2PDIrr $eax
+    $eax = MOVPDI2DIrr $xmm0
+    RET 0
+...

>From 375887a00bf52f27a9c1ae45d2f119cacc917455 Mon Sep 17 00:00:00 2001
From: Vasileios Porpodas <vporpodas at google.com>
Date: Fri, 20 Dec 2024 02:32:36 -0800
Subject: [PATCH 8/9] [Spill2Reg] Add code generation for 8/16-bit
 spills/reloads on x86

This patch adds support for spilling 8- and 16-bit values on x86.

Original review: https://reviews.llvm.org/D118305
---
 llvm/lib/Target/X86/X86InstrInfo.cpp          | 46 +++++++++++++++++++
 llvm/lib/Target/X86/X86InstrInfo.h            |  4 ++
 .../CodeGen/X86/spill2reg_end_to_end_16bit.ll | 16 +++----
 .../CodeGen/X86/spill2reg_end_to_end_8bit.ll  | 16 +++----
 .../CodeGen/X86/spill2reg_simple_1_16bit.mir  | 39 ++++++++++++++++
 .../CodeGen/X86/spill2reg_simple_1_8bit.mir   | 39 ++++++++++++++++
 6 files changed, 144 insertions(+), 16 deletions(-)
 create mode 100644 llvm/test/CodeGen/X86/spill2reg_simple_1_16bit.mir
 create mode 100644 llvm/test/CodeGen/X86/spill2reg_simple_1_8bit.mir
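
The core trick, in isolation before the diff: movd moves 32 or 64 bits, so an
8/16-bit GPR is first widened to its 32-bit super-register (e.g. $al to $eax)
and the movd operates on that. A sketch of the widening using the
TargetRegisterInfo API and the X86 subregister indices; the function name and
the omitted error handling are this sketch's own:

  #include <cassert>
  #include "llvm/CodeGen/TargetRegisterInfo.h"
  // Assumes it lives inside the X86 backend, where X86::sub_8bit and friends
  // are visible.
  using namespace llvm;

  // Map an 8/16-bit GPR to the 32-bit super-register that movd can use,
  // e.g. AL -> EAX, AX -> EAX. Mirrors getMovdCompatibleReg() in the diff.
  static MCRegister widenForMovd(MCRegister Reg, unsigned Bits,
                                 const TargetRegisterInfo *TRI) {
    assert((Bits == 8 || Bits == 16) && "only 8/16-bit regs need widening");
    unsigned SubIdx = Bits == 8 ? X86::sub_8bit : X86::sub_16bit;
    const TargetRegisterClass *GR32 = TRI->getRegClass(X86::GR32RegClassID);
    return TRI->getMatchingSuperReg(Reg, SubIdx, GR32);
  }

The resulting code for an 8-bit spill/reload pair then looks like this
(cf. the tests below):

  movb D0(%rip), %al      # 8-bit load into AL
  movd %eax, %xmm0        # spill via the 32-bit super-register EAX
  movd %xmm0, %eax        # reload back through EAX
  movb %al, U0(%rip)      # 8-bit use of AL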

diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 8f08f8bff75980..5e9a6ed2fb5164 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -10954,6 +10954,8 @@ bool X86InstrInfo::isLegalToSpill2Reg(Register Reg,
   switch (unsigned Bits = TRI->getRegSizeInBits(Reg, *MRI)) {
   case 64:
   case 32:
+  case 16:
+  case 8:
     return true;
   }
   return false;
@@ -11049,6 +11051,8 @@ bool X86InstrInfo::isSpill2RegProfitable(const MachineInstr *MI,
 
 static unsigned getInsertOrExtractOpcode(unsigned Bits, bool Insert) {
   switch (Bits) {
+  case 8:
+  case 16:
   case 32:
     return Insert ? X86::MOVDI2PDIrr : X86::MOVPDI2DIrr;
   case 64:
@@ -11058,6 +11062,36 @@ static unsigned getInsertOrExtractOpcode(unsigned Bits, bool Insert) {
   }
 }
 
+/// \Returns the subreg index for getting a subregister of \p SubregBits bits
+/// from a register of \p RegBits bits.
+static unsigned spill2RegGetSubregIdx(unsigned RegBits, unsigned SubregBits) {
+  assert(RegBits > SubregBits && "RegBits expected to cover SubregBits");
+  switch (SubregBits) {
+  case 32:
+    return X86::sub_32bit;
+  case 16:
+    return X86::sub_16bit;
+  case 8:
+    return X86::sub_8bit;
+  default:
+    llvm_unreachable("FIXME");
+  }
+}
+
+std::optional<MCRegister>
+X86InstrInfo::getMovdCompatibleReg(MCRegister OldReg, uint32_t OldRegBits,
+                                   const TargetRegisterInfo *TRI) const {
+  if (OldRegBits != 8 && OldRegBits != 16)
+    return std::nullopt;
+  // The register class that `movd` can handle.
+  const TargetRegisterClass *NewRegClass =
+      TRI->getRegClass(X86::GR32RegClassID);
+  unsigned NewRegBits = TRI->getRegSizeInBits(*NewRegClass);
+  unsigned SubIdx = spill2RegGetSubregIdx(NewRegBits, OldRegBits);
+  MCRegister NewReg = TRI->getMatchingSuperReg(OldReg, SubIdx, NewRegClass);
+  return NewReg;
+}
+
 MachineInstr *X86InstrInfo::spill2RegInsertToVectorReg(
     Register DstReg, Register SrcReg, int OperationBits, MachineBasicBlock *MBB,
     MachineBasicBlock::iterator InsertBeforeIt,
@@ -11066,6 +11100,12 @@ MachineInstr *X86InstrInfo::spill2RegInsertToVectorReg(
   unsigned InsertOpcode =
       getInsertOrExtractOpcode(OperationBits, true /*insert*/);
   const MCInstrDesc &InsertMCID = get(InsertOpcode);
+  // `movd` does not support 8/16-bit operands, so we use the 32-bit
+  // super-register instead. For example:
+  //   $al = ...
+  //   $xmm0 = MOVDI2PDIrr $eax
+  if (auto NewReg = getMovdCompatibleReg(SrcReg, OperationBits, TRI))
+    SrcReg = *NewReg;
   MachineInstr *InsertMI =
       BuildMI(*MBB, InsertBeforeIt, DL, InsertMCID, DstReg).addReg(SrcReg);
   return InsertMI;
@@ -11079,6 +11119,12 @@ MachineInstr *X86InstrInfo::spill2RegExtractFromVectorReg(
   unsigned ExtractOpcode =
       getInsertOrExtractOpcode(OperationBits, false /*extract*/);
   const MCInstrDesc &ExtractMCID = get(ExtractOpcode);
+  // `movd` does not support 8/16-bit operands, so we use the 32-bit
+  // super-register instead. For example:
+  //   $eax = MOVPDI2DIrr $xmm0
+  //   ... = $al
+  if (auto NewReg = getMovdCompatibleReg(DstReg, OperationBits, TRI))
+    DstReg = *NewReg;
   MachineInstr *ExtractMI =
       BuildMI(*InsertMBB, InsertBeforeIt, DL, ExtractMCID, DstReg)
           .addReg(SrcReg);
diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h
index 3d67a5e4522e71..0044c6765ca2df 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.h
+++ b/llvm/lib/Target/X86/X86InstrInfo.h
@@ -756,6 +756,10 @@ class X86InstrInfo final : public X86GenInstrInfo {
                              const TargetRegisterInfo *TRI,
                              const MachineRegisterInfo *MRI) const override;
 
+  std::optional<MCRegister>
+  getMovdCompatibleReg(MCRegister OldReg, uint32_t OldRegBits,
+                       const TargetRegisterInfo *TRI) const;
+
   MachineInstr *
   spill2RegInsertToVectorReg(Register DstReg, Register SrcReg,
                              int OperationBits, MachineBasicBlock *MBB,
diff --git a/llvm/test/CodeGen/X86/spill2reg_end_to_end_16bit.ll b/llvm/test/CodeGen/X86/spill2reg_end_to_end_16bit.ll
index 9d454d015fd9e2..b97fc092c3c68f 100644
--- a/llvm/test/CodeGen/X86/spill2reg_end_to_end_16bit.ll
+++ b/llvm/test/CodeGen/X86/spill2reg_end_to_end_16bit.ll
@@ -67,7 +67,7 @@ define dso_local void @_Z5spillv() local_unnamed_addr #0 {
 ; CHECK-NEXT:    .cfi_offset %r14, -32
 ; CHECK-NEXT:    .cfi_offset %r15, -24
 ; CHECK-NEXT:    .cfi_offset %rbp, -16
-; CHECK-NEXT:    movzwl D0(%rip), %eax
+; CHECK-NEXT:    movw D0(%rip), %ax
 ; CHECK-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
 ; CHECK-NEXT:    movzwl D1(%rip), %ecx
 ; CHECK-NEXT:    movzwl D2(%rip), %edx
@@ -83,12 +83,12 @@ define dso_local void @_Z5spillv() local_unnamed_addr #0 {
 ; CHECK-NEXT:    movzwl D12(%rip), %r15d
 ; CHECK-NEXT:    movzwl D13(%rip), %r12d
 ; CHECK-NEXT:    movzwl D14(%rip), %r13d
-; CHECK-NEXT:    movzwl D15(%rip), %eax
-; CHECK-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; CHECK-NEXT:    movzwl D16(%rip), %eax
-; CHECK-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; CHECK-NEXT:    movzwl D17(%rip), %eax
+; CHECK-NEXT:    movw D15(%rip), %ax
+; CHECK-NEXT:    movd %eax, %xmm0
+; CHECK-NEXT:    movw D16(%rip), %ax
 ; CHECK-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; CHECK-NEXT:    movw D17(%rip), %ax
+; CHECK-NEXT:    movd %eax, %xmm1
 ; CHECK-NEXT:    movzwl D18(%rip), %eax
 ; CHECK-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
 ; CHECK-NEXT:    #APP
@@ -109,11 +109,11 @@ define dso_local void @_Z5spillv() local_unnamed_addr #0 {
 ; CHECK-NEXT:    movw %r15w, U12(%rip)
 ; CHECK-NEXT:    movw %r12w, U13(%rip)
 ; CHECK-NEXT:    movw %r13w, U14(%rip)
-; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
+; CHECK-NEXT:    movd %xmm0, %eax
 ; CHECK-NEXT:    movw %ax, U15(%rip)
 ; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
 ; CHECK-NEXT:    movw %ax, U16(%rip)
-; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
+; CHECK-NEXT:    movd %xmm1, %eax
 ; CHECK-NEXT:    movw %ax, U17(%rip)
 ; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
 ; CHECK-NEXT:    movw %ax, U18(%rip)
diff --git a/llvm/test/CodeGen/X86/spill2reg_end_to_end_8bit.ll b/llvm/test/CodeGen/X86/spill2reg_end_to_end_8bit.ll
index cd22eb43c07e72..8883c9f3835914 100644
--- a/llvm/test/CodeGen/X86/spill2reg_end_to_end_8bit.ll
+++ b/llvm/test/CodeGen/X86/spill2reg_end_to_end_8bit.ll
@@ -67,7 +67,7 @@ define dso_local void @_Z5spillv() local_unnamed_addr #0 {
 ; CHECK-NEXT:    .cfi_offset %r14, -32
 ; CHECK-NEXT:    .cfi_offset %r15, -24
 ; CHECK-NEXT:    .cfi_offset %rbp, -16
-; CHECK-NEXT:    movzbl D0(%rip), %eax
+; CHECK-NEXT:    movb D0(%rip), %al
 ; CHECK-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; CHECK-NEXT:    movzbl D1(%rip), %ecx
 ; CHECK-NEXT:    movzbl D2(%rip), %edx
@@ -83,12 +83,12 @@ define dso_local void @_Z5spillv() local_unnamed_addr #0 {
 ; CHECK-NEXT:    movzbl D12(%rip), %r15d
 ; CHECK-NEXT:    movzbl D13(%rip), %r12d
 ; CHECK-NEXT:    movzbl D14(%rip), %r13d
-; CHECK-NEXT:    movzbl D15(%rip), %eax
-; CHECK-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-NEXT:    movzbl D16(%rip), %eax
-; CHECK-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-NEXT:    movzbl D17(%rip), %eax
+; CHECK-NEXT:    movb D15(%rip), %al
+; CHECK-NEXT:    movd %eax, %xmm0
+; CHECK-NEXT:    movb D16(%rip), %al
 ; CHECK-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movb D17(%rip), %al
+; CHECK-NEXT:    movd %eax, %xmm1
 ; CHECK-NEXT:    movzbl D18(%rip), %eax
 ; CHECK-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; CHECK-NEXT:    #APP
@@ -109,11 +109,11 @@ define dso_local void @_Z5spillv() local_unnamed_addr #0 {
 ; CHECK-NEXT:    movb %r15b, U12(%rip)
 ; CHECK-NEXT:    movb %r12b, U13(%rip)
 ; CHECK-NEXT:    movb %r13b, U14(%rip)
-; CHECK-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; CHECK-NEXT:    movd %xmm0, %eax
 ; CHECK-NEXT:    movb %al, U15(%rip)
 ; CHECK-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
 ; CHECK-NEXT:    movb %al, U16(%rip)
-; CHECK-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; CHECK-NEXT:    movd %xmm1, %eax
 ; CHECK-NEXT:    movb %al, U17(%rip)
 ; CHECK-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
 ; CHECK-NEXT:    movb %al, U18(%rip)
diff --git a/llvm/test/CodeGen/X86/spill2reg_simple_1_16bit.mir b/llvm/test/CodeGen/X86/spill2reg_simple_1_16bit.mir
new file mode 100644
index 00000000000000..65da23411e2ab1
--- /dev/null
+++ b/llvm/test/CodeGen/X86/spill2reg_simple_1_16bit.mir
@@ -0,0 +1,39 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc %s -o - -mtriple=x86_64-unknown-linux -enable-spill2reg -mattr=+sse4.1 --run-pass=spill2reg -simplify-mir -spill2reg-mem-instrs=0 -spill2reg-vec-instrs=99999 | FileCheck %s
+
+# Simple test with a single 16-bit spill-reload pair:
+#   spill stack.0
+#   reload stack.0
+
+--- |
+  @D0 = dso_local local_unnamed_addr global i32 0, align 4
+  @U0 = dso_local local_unnamed_addr global i32 0, align 4
+  define void @func() { ret void }
+...
+---
+name: func
+alignment:       16
+tracksRegLiveness: true
+tracksDebugUserValues: true
+frameInfo:
+  maxAlignment:    4
+stack:
+  - { id: 0, type: spill-slot, size: 4, alignment: 4 }
+machineFunctionInfo: {}
+body:             |
+
+  bb.0:
+    ; spill
+    ; CHECK-LABEL: name: func
+    ; CHECK: $ax = MOV16rm $rip, 1, $noreg, @D0, $noreg :: (dereferenceable load (s16) from @D0)
+    ; CHECK-NEXT: $xmm0 = MOVDI2PDIrr $eax
+    ; CHECK-NEXT: $eax = MOVPDI2DIrr $xmm0
+    ; CHECK-NEXT: MOV16mr $rip, 1, $noreg, @U0, $noreg, killed renamable $ax :: (store (s16) into @U0)
+    ; CHECK-NEXT: RET 0
+    $ax = MOV16rm $rip, 1, $noreg, @D0, $noreg :: (dereferenceable load (s16) from @D0)
+    MOV16mr %stack.0, 1, $noreg, 0, $noreg, killed renamable $ax :: (store (s16) into %stack.0)
+    ; reload
+    $ax = MOV16rm %stack.0, 1, $noreg, 0, $noreg :: (load (s16) from %stack.0)
+    MOV16mr $rip, 1, $noreg, @U0, $noreg, killed renamable $ax :: (store (s16) into @U0)
+    RET 0
+...
diff --git a/llvm/test/CodeGen/X86/spill2reg_simple_1_8bit.mir b/llvm/test/CodeGen/X86/spill2reg_simple_1_8bit.mir
new file mode 100644
index 00000000000000..a34a4c5748c014
--- /dev/null
+++ b/llvm/test/CodeGen/X86/spill2reg_simple_1_8bit.mir
@@ -0,0 +1,39 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc %s -o - -mtriple=x86_64-unknown-linux -enable-spill2reg -mattr=+sse4.1 --run-pass=spill2reg -simplify-mir -spill2reg-mem-instrs=0 -spill2reg-vec-instrs=99999 | FileCheck %s
+
+# Simple test with a single 8-bit spill-reload pair:
+#   spill stack.0
+#   reload stack.0
+
+--- |
+  @D0 = dso_local local_unnamed_addr global i32 0, align 4
+  @U0 = dso_local local_unnamed_addr global i32 0, align 4
+  define void @func() { ret void }
+...
+---
+name: func
+alignment:       16
+tracksRegLiveness: true
+tracksDebugUserValues: true
+frameInfo:
+  maxAlignment:    4
+stack:
+  - { id: 0, type: spill-slot, size: 4, alignment: 4 }
+machineFunctionInfo: {}
+body:             |
+
+  bb.0:
+    ; spill
+    ; CHECK-LABEL: name: func
+    ; CHECK: $al = MOV8rm $rip, 1, $noreg, @D0, $noreg :: (dereferenceable load (s8) from @D0)
+    ; CHECK-NEXT: $xmm0 = MOVDI2PDIrr $eax
+    ; CHECK-NEXT: $eax = MOVPDI2DIrr $xmm0
+    ; CHECK-NEXT: MOV8mr $rip, 1, $noreg, @U0, $noreg, killed renamable $al :: (store (s8) into @U0)
+    ; CHECK-NEXT: RET 0
+    $al = MOV8rm $rip, 1, $noreg, @D0, $noreg :: (dereferenceable load (s8) from @D0)
+    MOV8mr %stack.0, 1, $noreg, 0, $noreg, killed renamable $al :: (store (s8) into %stack.0)
+    ; reload
+    $al = MOV8rm %stack.0, 1, $noreg, 0, $noreg :: (load (s8) from %stack.0)
+    MOV8mr $rip, 1, $noreg, @U0, $noreg, killed renamable $al :: (store (s8) into @U0)
+    RET 0
+...

>From 04a5281dbf6cde86c6c8807c29b6674801563d74 Mon Sep 17 00:00:00 2001
From: Vasileios Porpodas <vporpodas at google.com>
Date: Fri, 20 Dec 2024 02:42:15 -0800
Subject: [PATCH 9/9] [Spill2Reg] Use AVX opcodes when available

This patch updates the vector spill/reload instructions to use the AVX
opcodes by default when the target supports them. This can be turned
off with the -spill2reg-no-avx flag.

Original review: https://reviews.llvm.org/D118951
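
For context, the opcode selection this patch adds boils down to the
minimal C++ sketch below (written for this note to summarize useAVX()
and getInsertOrExtractOpcode() from the X86InstrInfo.cpp hunk; the
helper name pickInsertOpcode is made up and the sketch is not itself
part of the diff):

  // Prefer the VMOV* variants when the subtarget has AVX and the
  // -spill2reg-no-avx escape hatch is not set.
  static unsigned pickInsertOpcode(unsigned Bits, bool UseAVX) {
    switch (Bits) {
    case 8:
    case 16:
    case 32: // 8/16-bit values are widened to 32 bits for (v)movd.
      return UseAVX ? X86::VMOVDI2PDIZrr : X86::MOVDI2PDIrr;
    case 64:
      return UseAVX ? X86::VMOV64toPQIZrr : X86::MOV64toPQIrr;
    default:
      llvm_unreachable("Unsupported bits");
    }
  }

The extract direction is symmetric, picking between MOVPDI2DIrr /
VMOVPDI2DIZrr and MOVPQIto64rr / VMOVPQIto64Zrr.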
---
 llvm/include/llvm/CodeGen/TargetInstrInfo.h   | 19 ++---
 llvm/lib/CodeGen/Spill2Reg.cpp                | 14 +--
 llvm/lib/Target/X86/X86InstrInfo.cpp          | 39 ++++++---
 llvm/lib/Target/X86/X86InstrInfo.h            |  7 +-
 .../CodeGen/X86/spill2reg_end_to_end_16bit.ll | 85 +++++++++++++++++++
 .../CodeGen/X86/spill2reg_end_to_end_32bit.ll | 85 +++++++++++++++++++
 .../CodeGen/X86/spill2reg_end_to_end_64bit.ll | 85 +++++++++++++++++++
 .../CodeGen/X86/spill2reg_end_to_end_8bit.ll  | 85 +++++++++++++++++++
 .../CodeGen/X86/spill2reg_simple_1_16bit.mir  |  7 ++
 .../CodeGen/X86/spill2reg_simple_1_32bit.mir  |  8 ++
 .../CodeGen/X86/spill2reg_simple_1_64bit.mir  | 14 +++
 .../CodeGen/X86/spill2reg_simple_1_8bit.mir   |  7 ++
 12 files changed, 427 insertions(+), 28 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index 33a0ed3c23c160..c1c0fb557edd84 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -2307,6 +2307,7 @@ class TargetInstrInfo : public MCInstrInfo {
 
   virtual const TargetRegisterClass *
   getVectorRegisterClassForSpill2Reg(const TargetRegisterInfo *TRI,
+                                     const TargetSubtargetInfo *STI,
                                      Register SpilledReg) const {
     llvm_unreachable(
         "Target didn't implement "
@@ -2322,21 +2323,19 @@ class TargetInstrInfo : public MCInstrInfo {
   }
 
   /// Inserts \p SrcReg into the first lane of \p DstReg.
-  virtual MachineInstr *
-  spill2RegInsertToVectorReg(Register DstReg, Register SrcReg,
-                             int OperationBits, MachineBasicBlock *MBB,
-                             MachineBasicBlock::iterator InsertBeforeIt,
-                             const TargetRegisterInfo *TRI) const {
+  virtual MachineInstr *spill2RegInsertToVectorReg(
+      Register DstReg, Register SrcReg, int OperationBits,
+      MachineBasicBlock *MBB, MachineBasicBlock::iterator InsertBeforeIt,
+      const TargetRegisterInfo *TRI, const TargetSubtargetInfo *STI) const {
     llvm_unreachable(
         "Target didn't implement TargetInstrInfo::spill2RegInsertToVectorReg!");
   }
 
   /// Extracts the first lane of \p SrcReg into \p DstReg.
-  virtual MachineInstr *
-  spill2RegExtractFromVectorReg(Register DstReg, Register SrcReg,
-                                int OperationBits, MachineBasicBlock *InsertMBB,
-                                MachineBasicBlock::iterator InsertBeforeIt,
-                                const TargetRegisterInfo *TRI) const {
+  virtual MachineInstr *spill2RegExtractFromVectorReg(
+      Register DstReg, Register SrcReg, int OperationBits,
+      MachineBasicBlock *InsertMBB, MachineBasicBlock::iterator InsertBeforeIt,
+      const TargetRegisterInfo *TRI, const TargetSubtargetInfo *STI) const {
     llvm_unreachable("Target didn't implement "
                      "TargetInstrInfo::spill2RegExtractFromVectorReg!");
   }
diff --git a/llvm/lib/CodeGen/Spill2Reg.cpp b/llvm/lib/CodeGen/Spill2Reg.cpp
index ee5aec529da451..a427e5df2b64ee 100644
--- a/llvm/lib/CodeGen/Spill2Reg.cpp
+++ b/llvm/lib/CodeGen/Spill2Reg.cpp
@@ -99,9 +99,10 @@ class Spill2Reg : public MachineFunctionPass {
     /// \Returns the register class of the register being spilled.
     const TargetRegisterClass *
     getSpilledRegClass(const TargetInstrInfo *TII,
-                       const TargetRegisterInfo *TRI) const {
+                       const TargetRegisterInfo *TRI,
+                       const TargetSubtargetInfo *STI) const {
       auto Reg0 = Spills.front().MO->getReg();
-      return TII->getVectorRegisterClassForSpill2Reg(TRI, Reg0);
+      return TII->getVectorRegisterClassForSpill2Reg(TRI, STI, Reg0);
     }
 
     /// \Returns the register class of the register being spilled.
@@ -359,7 +360,7 @@ void Spill2Reg::replaceStackWithReg(StackSlotDataEntry &Entry,
 
     TII->spill2RegInsertToVectorReg(
         VectorReg, OldReg, SpillData.SpillBits, StackSpill->getParent(),
-        /*InsertBeforeIt=*/StackSpill->getIterator(), TRI);
+        /*InsertBeforeIt=*/StackSpill->getIterator(), TRI, &MF->getSubtarget());
 
     // Mark VectorReg as live in the instr's BB.
     LRUs[StackSpill->getParent()].addReg(VectorReg);
@@ -376,7 +377,8 @@ void Spill2Reg::replaceStackWithReg(StackSlotDataEntry &Entry,
 
     TII->spill2RegExtractFromVectorReg(
         OldReg, VectorReg, ReloadData.SpillBits, StackReload->getParent(),
-        /*InsertBeforeIt=*/StackReload->getIterator(), TRI);
+        /*InsertBeforeIt=*/StackReload->getIterator(), TRI,
+        &MF->getSubtarget());
 
     // Mark VectorReg as live in the instr's BB.
     LRUs[StackReload->getParent()].addReg(VectorReg);
@@ -487,8 +489,8 @@ void Spill2Reg::generateCode() {
     calculateLiveRegs(Entry, LRU);
 
     // Look for a physical register that is not in LRU.
-    std::optional<MCRegister> PhysVectorRegOpt =
-        tryGetFreePhysicalReg(Entry.getSpilledRegClass(TII, TRI), LRU);
+    std::optional<MCRegister> PhysVectorRegOpt = tryGetFreePhysicalReg(
+        Entry.getSpilledRegClass(TII, TRI, &MF->getSubtarget()), LRU);
     if (!PhysVectorRegOpt)
       continue;
 
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 5e9a6ed2fb5164..d37099158ff18a 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -94,6 +94,10 @@ static cl::opt<int> Spill2RegExplorationDst(
     cl::desc("When checking for profitability, explore nearby instructions "
              "at this maximum distance."));
 
+static cl::opt<bool> Spill2RegNoAVX(
+    "spill2reg-no-avx", cl::Hidden, cl::init(false),
+    cl::desc("Don't use AVX instructions even if the targets supports them."));
+
 // Pin the vtable to this file.
 void X86InstrInfo::anchor() {}
 
@@ -10967,11 +10971,18 @@ bool X86InstrInfo::targetSupportsSpill2Reg(
   return X86STI->hasSSE41();
 }
 
+static inline bool useAVX(const TargetSubtargetInfo *STI) {
+  const X86Subtarget *X86STI = static_cast<const X86Subtarget *>(STI);
+  bool UseAVX = X86STI->hasAVX() && !Spill2RegNoAVX;
+  return UseAVX;
+}
+
 const TargetRegisterClass *
 X86InstrInfo::getVectorRegisterClassForSpill2Reg(const TargetRegisterInfo *TRI,
+                                                 const TargetSubtargetInfo *STI,
                                                  Register SpilledReg) const {
-  const TargetRegisterClass *VecRegClass =
-      TRI->getRegClass(X86::VR128RegClassID);
+  const TargetRegisterClass *VecRegClass = TRI->getRegClass(
+      useAVX(STI) ? X86::VR128XRegClassID : X86::VR128RegClassID);
   return VecRegClass;
 }
 
@@ -11049,14 +11060,22 @@ bool X86InstrInfo::isSpill2RegProfitable(const MachineInstr *MI,
   return MemHeuristic && VecHeuristic;
 }
 
-static unsigned getInsertOrExtractOpcode(unsigned Bits, bool Insert) {
+static unsigned getInsertOrExtractOpcode(unsigned Bits, bool Insert,
+                                         const TargetSubtargetInfo *STI) {
+  bool UseAVX = useAVX(STI);
   switch (Bits) {
   case 8:
   case 16:
   case 32:
-    return Insert ? X86::MOVDI2PDIrr : X86::MOVPDI2DIrr;
+    if (UseAVX)
+      return Insert ? X86::VMOVDI2PDIZrr : X86::VMOVPDI2DIZrr;
+    else
+      return Insert ? X86::MOVDI2PDIrr : X86::MOVPDI2DIrr;
   case 64:
-    return Insert ? X86::MOV64toPQIrr : X86::MOVPQIto64rr;
+    if (UseAVX)
+      return Insert ? X86::VMOV64toPQIZrr : X86::VMOVPQIto64Zrr;
+    else
+      return Insert ? X86::MOV64toPQIrr : X86::MOVPQIto64rr;
   default:
     llvm_unreachable("Unsupported bits");
   }
@@ -11094,11 +11113,11 @@ X86InstrInfo::getMovdCompatibleReg(MCRegister OldReg, uint32_t OldRegBits,
 
 MachineInstr *X86InstrInfo::spill2RegInsertToVectorReg(
     Register DstReg, Register SrcReg, int OperationBits, MachineBasicBlock *MBB,
-    MachineBasicBlock::iterator InsertBeforeIt,
-    const TargetRegisterInfo *TRI) const {
+    MachineBasicBlock::iterator InsertBeforeIt, const TargetRegisterInfo *TRI,
+    const TargetSubtargetInfo *STI) const {
   DebugLoc DL;
   unsigned InsertOpcode =
-      getInsertOrExtractOpcode(OperationBits, true /*insert*/);
+      getInsertOrExtractOpcode(OperationBits, true /*insert*/, STI);
   const MCInstrDesc &InsertMCID = get(InsertOpcode);
   // `movd` does not support 8/16 bit operands. Instead, we use a 32-bit
   // register. For example:
@@ -11114,10 +11133,10 @@ MachineInstr *X86InstrInfo::spill2RegInsertToVectorReg(
 MachineInstr *X86InstrInfo::spill2RegExtractFromVectorReg(
     Register DstReg, Register SrcReg, int OperationBits,
     MachineBasicBlock *InsertMBB, MachineBasicBlock::iterator InsertBeforeIt,
-    const TargetRegisterInfo *TRI) const {
+    const TargetRegisterInfo *TRI, const TargetSubtargetInfo *STI) const {
   DebugLoc DL;
   unsigned ExtractOpcode =
-      getInsertOrExtractOpcode(OperationBits, false /*extract*/);
+      getInsertOrExtractOpcode(OperationBits, false /*extract*/, STI);
   const MCInstrDesc &ExtractMCID = get(ExtractOpcode);
   // `movd` does not support 8/16 bit operands. Instead, we use a 32-bit
   // register. For example:
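
A note on the 8/16-bit handling mentioned in the comments above: since
(v)movd only accepts 32/64-bit GPR operands, getMovdCompatibleReg()
maps an 8/16-bit spilled register to its 32-bit super-register before
the insert/extract instruction is built. A minimal standalone sketch of
that mapping (illustrative only; the helper name widenForMovd and the
exact lookup are assumptions, not the patch's code):

  // Walk the super-registers of a narrow GPR and return the 32-bit
  // one, e.g. $al or $ax -> $eax, so that (v)movd can operate on it.
  static MCRegister widenForMovd(MCRegister Reg,
                                 const TargetRegisterInfo *TRI) {
    for (MCPhysReg Super : TRI->superregs_inclusive(Reg))
      if (X86::GR32RegClass.contains(Super))
        return Super;
    return Reg; // Already 32/64 bits wide; usable as-is.
  }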
diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h
index 0044c6765ca2df..4a00a33b345b34 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.h
+++ b/llvm/lib/Target/X86/X86InstrInfo.h
@@ -750,6 +750,7 @@ class X86InstrInfo final : public X86GenInstrInfo {
 
   const TargetRegisterClass *
   getVectorRegisterClassForSpill2Reg(const TargetRegisterInfo *TRI,
+                                     const TargetSubtargetInfo *STI,
                                      Register SpilledReg) const override;
 
   bool isSpill2RegProfitable(const MachineInstr *MI,
@@ -764,13 +765,15 @@ class X86InstrInfo final : public X86GenInstrInfo {
   spill2RegInsertToVectorReg(Register DstReg, Register SrcReg,
                              int OperationBits, MachineBasicBlock *MBB,
                              MachineBasicBlock::iterator InsertBeforeIt,
-                             const TargetRegisterInfo *TRI) const override;
+                             const TargetRegisterInfo *TRI,
+                             const TargetSubtargetInfo *STI) const override;
 
   MachineInstr *
   spill2RegExtractFromVectorReg(Register DstReg, Register SrcReg,
                                 int OperationBits, MachineBasicBlock *InsertMBB,
                                 MachineBasicBlock::iterator InsertBeforeIt,
-                                const TargetRegisterInfo *TRI) const override;
+                                const TargetRegisterInfo *TRI,
+                                const TargetSubtargetInfo *STI) const override;
 };
 } // namespace llvm
 
diff --git a/llvm/test/CodeGen/X86/spill2reg_end_to_end_16bit.ll b/llvm/test/CodeGen/X86/spill2reg_end_to_end_16bit.ll
index b97fc092c3c68f..7e696f05606339 100644
--- a/llvm/test/CodeGen/X86/spill2reg_end_to_end_16bit.ll
+++ b/llvm/test/CodeGen/X86/spill2reg_end_to_end_16bit.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc %s -o - -mtriple=x86_64-unknown-linux -enable-spill2reg -mattr=+sse4.1 | FileCheck %s
+; RUN: llc %s -o - -mtriple=x86_64-unknown-linux -enable-spill2reg -mattr=+avx | FileCheck --check-prefix=AVX %s
 
 ; End-to-end check that Spill2Reg works with 16-bit registers.
 
@@ -130,6 +131,90 @@ define dso_local void @_Z5spillv() local_unnamed_addr #0 {
 ; CHECK-NEXT:    popq %rbp
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
+;
+; AVX-LABEL: _Z5spillv:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    pushq %rbp
+; AVX-NEXT:    .cfi_def_cfa_offset 16
+; AVX-NEXT:    pushq %r15
+; AVX-NEXT:    .cfi_def_cfa_offset 24
+; AVX-NEXT:    pushq %r14
+; AVX-NEXT:    .cfi_def_cfa_offset 32
+; AVX-NEXT:    pushq %r13
+; AVX-NEXT:    .cfi_def_cfa_offset 40
+; AVX-NEXT:    pushq %r12
+; AVX-NEXT:    .cfi_def_cfa_offset 48
+; AVX-NEXT:    pushq %rbx
+; AVX-NEXT:    .cfi_def_cfa_offset 56
+; AVX-NEXT:    .cfi_offset %rbx, -56
+; AVX-NEXT:    .cfi_offset %r12, -48
+; AVX-NEXT:    .cfi_offset %r13, -40
+; AVX-NEXT:    .cfi_offset %r14, -32
+; AVX-NEXT:    .cfi_offset %r15, -24
+; AVX-NEXT:    .cfi_offset %rbp, -16
+; AVX-NEXT:    movw D0(%rip), %ax
+; AVX-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX-NEXT:    movzwl D1(%rip), %ecx
+; AVX-NEXT:    movzwl D2(%rip), %edx
+; AVX-NEXT:    movzwl D3(%rip), %esi
+; AVX-NEXT:    movzwl D4(%rip), %edi
+; AVX-NEXT:    movzwl D5(%rip), %r8d
+; AVX-NEXT:    movzwl D6(%rip), %r9d
+; AVX-NEXT:    movzwl D7(%rip), %r10d
+; AVX-NEXT:    movzwl D8(%rip), %r11d
+; AVX-NEXT:    movzwl D9(%rip), %ebx
+; AVX-NEXT:    movzwl D10(%rip), %ebp
+; AVX-NEXT:    movzwl D11(%rip), %r14d
+; AVX-NEXT:    movzwl D12(%rip), %r15d
+; AVX-NEXT:    movzwl D13(%rip), %r12d
+; AVX-NEXT:    movzwl D14(%rip), %r13d
+; AVX-NEXT:    movw D15(%rip), %ax
+; AVX-NEXT:    vmovd %eax, %xmm0
+; AVX-NEXT:    movw D16(%rip), %ax
+; AVX-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX-NEXT:    movw D17(%rip), %ax
+; AVX-NEXT:    vmovd %eax, %xmm1
+; AVX-NEXT:    movzwl D18(%rip), %eax
+; AVX-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX-NEXT:    #APP
+; AVX-NEXT:    #NO_APP
+; AVX-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
+; AVX-NEXT:    movw %ax, U0(%rip)
+; AVX-NEXT:    movw %cx, U1(%rip)
+; AVX-NEXT:    movw %dx, U2(%rip)
+; AVX-NEXT:    movw %si, U3(%rip)
+; AVX-NEXT:    movw %di, U4(%rip)
+; AVX-NEXT:    movw %r8w, U5(%rip)
+; AVX-NEXT:    movw %r9w, U6(%rip)
+; AVX-NEXT:    movw %r10w, U7(%rip)
+; AVX-NEXT:    movw %r11w, U8(%rip)
+; AVX-NEXT:    movw %bx, U9(%rip)
+; AVX-NEXT:    movw %bp, U10(%rip)
+; AVX-NEXT:    movw %r14w, U11(%rip)
+; AVX-NEXT:    movw %r15w, U12(%rip)
+; AVX-NEXT:    movw %r12w, U13(%rip)
+; AVX-NEXT:    movw %r13w, U14(%rip)
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    movw %ax, U15(%rip)
+; AVX-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
+; AVX-NEXT:    movw %ax, U16(%rip)
+; AVX-NEXT:    vmovd %xmm1, %eax
+; AVX-NEXT:    movw %ax, U17(%rip)
+; AVX-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
+; AVX-NEXT:    movw %ax, U18(%rip)
+; AVX-NEXT:    popq %rbx
+; AVX-NEXT:    .cfi_def_cfa_offset 48
+; AVX-NEXT:    popq %r12
+; AVX-NEXT:    .cfi_def_cfa_offset 40
+; AVX-NEXT:    popq %r13
+; AVX-NEXT:    .cfi_def_cfa_offset 32
+; AVX-NEXT:    popq %r14
+; AVX-NEXT:    .cfi_def_cfa_offset 24
+; AVX-NEXT:    popq %r15
+; AVX-NEXT:    .cfi_def_cfa_offset 16
+; AVX-NEXT:    popq %rbp
+; AVX-NEXT:    .cfi_def_cfa_offset 8
+; AVX-NEXT:    retq
 entry:
   %0 = load i16, i16* @D0
   %1 = load i16, i16* @D1
diff --git a/llvm/test/CodeGen/X86/spill2reg_end_to_end_32bit.ll b/llvm/test/CodeGen/X86/spill2reg_end_to_end_32bit.ll
index 3f1811cb971f53..442cfc8f2e378c 100644
--- a/llvm/test/CodeGen/X86/spill2reg_end_to_end_32bit.ll
+++ b/llvm/test/CodeGen/X86/spill2reg_end_to_end_32bit.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc %s -o - -mtriple=x86_64-unknown-linux -enable-spill2reg -mattr=+sse4.1 | FileCheck %s
+; RUN: llc %s -o - -mtriple=x86_64-unknown-linux -enable-spill2reg -mattr=+avx | FileCheck --check-prefix=AVX %s
 
 ; End-to-end check that Spill2Reg works with 32-bit registers.
 
@@ -130,6 +131,90 @@ define dso_local void @_Z5spillv() local_unnamed_addr #0 {
 ; CHECK-NEXT:    popq %rbp
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
+;
+; AVX-LABEL: _Z5spillv:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    pushq %rbp
+; AVX-NEXT:    .cfi_def_cfa_offset 16
+; AVX-NEXT:    pushq %r15
+; AVX-NEXT:    .cfi_def_cfa_offset 24
+; AVX-NEXT:    pushq %r14
+; AVX-NEXT:    .cfi_def_cfa_offset 32
+; AVX-NEXT:    pushq %r13
+; AVX-NEXT:    .cfi_def_cfa_offset 40
+; AVX-NEXT:    pushq %r12
+; AVX-NEXT:    .cfi_def_cfa_offset 48
+; AVX-NEXT:    pushq %rbx
+; AVX-NEXT:    .cfi_def_cfa_offset 56
+; AVX-NEXT:    .cfi_offset %rbx, -56
+; AVX-NEXT:    .cfi_offset %r12, -48
+; AVX-NEXT:    .cfi_offset %r13, -40
+; AVX-NEXT:    .cfi_offset %r14, -32
+; AVX-NEXT:    .cfi_offset %r15, -24
+; AVX-NEXT:    .cfi_offset %rbp, -16
+; AVX-NEXT:    movl D0(%rip), %eax
+; AVX-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX-NEXT:    movl D1(%rip), %ecx
+; AVX-NEXT:    movl D2(%rip), %edx
+; AVX-NEXT:    movl D3(%rip), %esi
+; AVX-NEXT:    movl D4(%rip), %edi
+; AVX-NEXT:    movl D5(%rip), %r8d
+; AVX-NEXT:    movl D6(%rip), %r9d
+; AVX-NEXT:    movl D7(%rip), %r10d
+; AVX-NEXT:    movl D8(%rip), %r11d
+; AVX-NEXT:    movl D9(%rip), %ebx
+; AVX-NEXT:    movl D10(%rip), %ebp
+; AVX-NEXT:    movl D11(%rip), %r14d
+; AVX-NEXT:    movl D12(%rip), %r15d
+; AVX-NEXT:    movl D13(%rip), %r12d
+; AVX-NEXT:    movl D14(%rip), %r13d
+; AVX-NEXT:    movl D15(%rip), %eax
+; AVX-NEXT:    vmovd %eax, %xmm0
+; AVX-NEXT:    movl D16(%rip), %eax
+; AVX-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX-NEXT:    movl D17(%rip), %eax
+; AVX-NEXT:    vmovd %eax, %xmm1
+; AVX-NEXT:    movl D18(%rip), %eax
+; AVX-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX-NEXT:    #APP
+; AVX-NEXT:    #NO_APP
+; AVX-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; AVX-NEXT:    movl %eax, U0(%rip)
+; AVX-NEXT:    movl %ecx, U1(%rip)
+; AVX-NEXT:    movl %edx, U2(%rip)
+; AVX-NEXT:    movl %esi, U3(%rip)
+; AVX-NEXT:    movl %edi, U4(%rip)
+; AVX-NEXT:    movl %r8d, U5(%rip)
+; AVX-NEXT:    movl %r9d, U6(%rip)
+; AVX-NEXT:    movl %r10d, U7(%rip)
+; AVX-NEXT:    movl %r11d, U8(%rip)
+; AVX-NEXT:    movl %ebx, U9(%rip)
+; AVX-NEXT:    movl %ebp, U10(%rip)
+; AVX-NEXT:    movl %r14d, U11(%rip)
+; AVX-NEXT:    movl %r15d, U12(%rip)
+; AVX-NEXT:    movl %r12d, U13(%rip)
+; AVX-NEXT:    movl %r13d, U14(%rip)
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    movl %eax, U15(%rip)
+; AVX-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; AVX-NEXT:    movl %eax, U16(%rip)
+; AVX-NEXT:    vmovd %xmm1, %eax
+; AVX-NEXT:    movl %eax, U17(%rip)
+; AVX-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; AVX-NEXT:    movl %eax, U18(%rip)
+; AVX-NEXT:    popq %rbx
+; AVX-NEXT:    .cfi_def_cfa_offset 48
+; AVX-NEXT:    popq %r12
+; AVX-NEXT:    .cfi_def_cfa_offset 40
+; AVX-NEXT:    popq %r13
+; AVX-NEXT:    .cfi_def_cfa_offset 32
+; AVX-NEXT:    popq %r14
+; AVX-NEXT:    .cfi_def_cfa_offset 24
+; AVX-NEXT:    popq %r15
+; AVX-NEXT:    .cfi_def_cfa_offset 16
+; AVX-NEXT:    popq %rbp
+; AVX-NEXT:    .cfi_def_cfa_offset 8
+; AVX-NEXT:    retq
 entry:
   %0 = load i32, i32* @D0
   %1 = load i32, i32* @D1
diff --git a/llvm/test/CodeGen/X86/spill2reg_end_to_end_64bit.ll b/llvm/test/CodeGen/X86/spill2reg_end_to_end_64bit.ll
index 652d077a66b280..00666a342db8b0 100644
--- a/llvm/test/CodeGen/X86/spill2reg_end_to_end_64bit.ll
+++ b/llvm/test/CodeGen/X86/spill2reg_end_to_end_64bit.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc %s -o - -mtriple=x86_64-unknown-linux -enable-spill2reg -mattr=+sse4.1 | FileCheck %s
+; RUN: llc %s -o - -mtriple=x86_64-unknown-linux -enable-spill2reg -mattr=+avx | FileCheck --check-prefix=AVX %s
 
 ; End-to-end check that Spill2Reg works with 64-bit registers.
 
@@ -130,6 +131,90 @@ define dso_local void @_Z5spillv() local_unnamed_addr #0 {
 ; CHECK-NEXT:    popq %rbp
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
+;
+; AVX-LABEL: _Z5spillv:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    pushq %rbp
+; AVX-NEXT:    .cfi_def_cfa_offset 16
+; AVX-NEXT:    pushq %r15
+; AVX-NEXT:    .cfi_def_cfa_offset 24
+; AVX-NEXT:    pushq %r14
+; AVX-NEXT:    .cfi_def_cfa_offset 32
+; AVX-NEXT:    pushq %r13
+; AVX-NEXT:    .cfi_def_cfa_offset 40
+; AVX-NEXT:    pushq %r12
+; AVX-NEXT:    .cfi_def_cfa_offset 48
+; AVX-NEXT:    pushq %rbx
+; AVX-NEXT:    .cfi_def_cfa_offset 56
+; AVX-NEXT:    .cfi_offset %rbx, -56
+; AVX-NEXT:    .cfi_offset %r12, -48
+; AVX-NEXT:    .cfi_offset %r13, -40
+; AVX-NEXT:    .cfi_offset %r14, -32
+; AVX-NEXT:    .cfi_offset %r15, -24
+; AVX-NEXT:    .cfi_offset %rbp, -16
+; AVX-NEXT:    movq D0(%rip), %rax
+; AVX-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX-NEXT:    movq D1(%rip), %rcx
+; AVX-NEXT:    movq D2(%rip), %rdx
+; AVX-NEXT:    movq D3(%rip), %rsi
+; AVX-NEXT:    movq D4(%rip), %rdi
+; AVX-NEXT:    movq D5(%rip), %r8
+; AVX-NEXT:    movq D6(%rip), %r9
+; AVX-NEXT:    movq D7(%rip), %r10
+; AVX-NEXT:    movq D8(%rip), %r11
+; AVX-NEXT:    movq D9(%rip), %rbx
+; AVX-NEXT:    movq D10(%rip), %r14
+; AVX-NEXT:    movq D11(%rip), %r15
+; AVX-NEXT:    movq D12(%rip), %r12
+; AVX-NEXT:    movq D13(%rip), %r13
+; AVX-NEXT:    movq D14(%rip), %rbp
+; AVX-NEXT:    movq D15(%rip), %rax
+; AVX-NEXT:    vmovq %rax, %xmm0
+; AVX-NEXT:    movq D16(%rip), %rax
+; AVX-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX-NEXT:    movq D17(%rip), %rax
+; AVX-NEXT:    vmovq %rax, %xmm1
+; AVX-NEXT:    movq D18(%rip), %rax
+; AVX-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX-NEXT:    #APP
+; AVX-NEXT:    #NO_APP
+; AVX-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX-NEXT:    movq %rax, U0(%rip)
+; AVX-NEXT:    movq %rcx, U1(%rip)
+; AVX-NEXT:    movq %rdx, U2(%rip)
+; AVX-NEXT:    movq %rsi, U3(%rip)
+; AVX-NEXT:    movq %rdi, U4(%rip)
+; AVX-NEXT:    movq %r8, U5(%rip)
+; AVX-NEXT:    movq %r9, U6(%rip)
+; AVX-NEXT:    movq %r10, U7(%rip)
+; AVX-NEXT:    movq %r11, U8(%rip)
+; AVX-NEXT:    movq %rbx, U9(%rip)
+; AVX-NEXT:    movq %r14, U10(%rip)
+; AVX-NEXT:    movq %r15, U11(%rip)
+; AVX-NEXT:    movq %r12, U12(%rip)
+; AVX-NEXT:    movq %r13, U13(%rip)
+; AVX-NEXT:    movq %rbp, U14(%rip)
+; AVX-NEXT:    vmovq %xmm0, %rax
+; AVX-NEXT:    movq %rax, U15(%rip)
+; AVX-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX-NEXT:    movq %rax, U16(%rip)
+; AVX-NEXT:    vmovq %xmm1, %rax
+; AVX-NEXT:    movq %rax, U17(%rip)
+; AVX-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX-NEXT:    movq %rax, U18(%rip)
+; AVX-NEXT:    popq %rbx
+; AVX-NEXT:    .cfi_def_cfa_offset 48
+; AVX-NEXT:    popq %r12
+; AVX-NEXT:    .cfi_def_cfa_offset 40
+; AVX-NEXT:    popq %r13
+; AVX-NEXT:    .cfi_def_cfa_offset 32
+; AVX-NEXT:    popq %r14
+; AVX-NEXT:    .cfi_def_cfa_offset 24
+; AVX-NEXT:    popq %r15
+; AVX-NEXT:    .cfi_def_cfa_offset 16
+; AVX-NEXT:    popq %rbp
+; AVX-NEXT:    .cfi_def_cfa_offset 8
+; AVX-NEXT:    retq
 entry:
   %0 = load i64, i64* @D0
   %1 = load i64, i64* @D1
diff --git a/llvm/test/CodeGen/X86/spill2reg_end_to_end_8bit.ll b/llvm/test/CodeGen/X86/spill2reg_end_to_end_8bit.ll
index 8883c9f3835914..8a890a7683e644 100644
--- a/llvm/test/CodeGen/X86/spill2reg_end_to_end_8bit.ll
+++ b/llvm/test/CodeGen/X86/spill2reg_end_to_end_8bit.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc %s -o - -mtriple=x86_64-unknown-linux -enable-spill2reg -mattr=+sse4.1 | FileCheck %s
+; RUN: llc %s -o - -mtriple=x86_64-unknown-linux -enable-spill2reg -mattr=+avx | FileCheck --check-prefix=AVX %s
 
 ; End-to-end check that Spill2Reg works with 8-bit registers.
 
@@ -130,6 +131,90 @@ define dso_local void @_Z5spillv() local_unnamed_addr #0 {
 ; CHECK-NEXT:    popq %rbp
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
+;
+; AVX-LABEL: _Z5spillv:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    pushq %rbp
+; AVX-NEXT:    .cfi_def_cfa_offset 16
+; AVX-NEXT:    pushq %r15
+; AVX-NEXT:    .cfi_def_cfa_offset 24
+; AVX-NEXT:    pushq %r14
+; AVX-NEXT:    .cfi_def_cfa_offset 32
+; AVX-NEXT:    pushq %r13
+; AVX-NEXT:    .cfi_def_cfa_offset 40
+; AVX-NEXT:    pushq %r12
+; AVX-NEXT:    .cfi_def_cfa_offset 48
+; AVX-NEXT:    pushq %rbx
+; AVX-NEXT:    .cfi_def_cfa_offset 56
+; AVX-NEXT:    .cfi_offset %rbx, -56
+; AVX-NEXT:    .cfi_offset %r12, -48
+; AVX-NEXT:    .cfi_offset %r13, -40
+; AVX-NEXT:    .cfi_offset %r14, -32
+; AVX-NEXT:    .cfi_offset %r15, -24
+; AVX-NEXT:    .cfi_offset %rbp, -16
+; AVX-NEXT:    movb D0(%rip), %al
+; AVX-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; AVX-NEXT:    movzbl D1(%rip), %ecx
+; AVX-NEXT:    movzbl D2(%rip), %edx
+; AVX-NEXT:    movzbl D3(%rip), %esi
+; AVX-NEXT:    movzbl D4(%rip), %edi
+; AVX-NEXT:    movzbl D5(%rip), %r8d
+; AVX-NEXT:    movzbl D6(%rip), %r9d
+; AVX-NEXT:    movzbl D7(%rip), %r10d
+; AVX-NEXT:    movzbl D8(%rip), %r11d
+; AVX-NEXT:    movzbl D9(%rip), %ebx
+; AVX-NEXT:    movzbl D10(%rip), %ebp
+; AVX-NEXT:    movzbl D11(%rip), %r14d
+; AVX-NEXT:    movzbl D12(%rip), %r15d
+; AVX-NEXT:    movzbl D13(%rip), %r12d
+; AVX-NEXT:    movzbl D14(%rip), %r13d
+; AVX-NEXT:    movb D15(%rip), %al
+; AVX-NEXT:    vmovd %eax, %xmm0
+; AVX-NEXT:    movb D16(%rip), %al
+; AVX-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; AVX-NEXT:    movb D17(%rip), %al
+; AVX-NEXT:    vmovd %eax, %xmm1
+; AVX-NEXT:    movzbl D18(%rip), %eax
+; AVX-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; AVX-NEXT:    #APP
+; AVX-NEXT:    #NO_APP
+; AVX-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; AVX-NEXT:    movb %al, U0(%rip)
+; AVX-NEXT:    movb %cl, U1(%rip)
+; AVX-NEXT:    movb %dl, U2(%rip)
+; AVX-NEXT:    movb %sil, U3(%rip)
+; AVX-NEXT:    movb %dil, U4(%rip)
+; AVX-NEXT:    movb %r8b, U5(%rip)
+; AVX-NEXT:    movb %r9b, U6(%rip)
+; AVX-NEXT:    movb %r10b, U7(%rip)
+; AVX-NEXT:    movb %r11b, U8(%rip)
+; AVX-NEXT:    movb %bl, U9(%rip)
+; AVX-NEXT:    movb %bpl, U10(%rip)
+; AVX-NEXT:    movb %r14b, U11(%rip)
+; AVX-NEXT:    movb %r15b, U12(%rip)
+; AVX-NEXT:    movb %r12b, U13(%rip)
+; AVX-NEXT:    movb %r13b, U14(%rip)
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    movb %al, U15(%rip)
+; AVX-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; AVX-NEXT:    movb %al, U16(%rip)
+; AVX-NEXT:    vmovd %xmm1, %eax
+; AVX-NEXT:    movb %al, U17(%rip)
+; AVX-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; AVX-NEXT:    movb %al, U18(%rip)
+; AVX-NEXT:    popq %rbx
+; AVX-NEXT:    .cfi_def_cfa_offset 48
+; AVX-NEXT:    popq %r12
+; AVX-NEXT:    .cfi_def_cfa_offset 40
+; AVX-NEXT:    popq %r13
+; AVX-NEXT:    .cfi_def_cfa_offset 32
+; AVX-NEXT:    popq %r14
+; AVX-NEXT:    .cfi_def_cfa_offset 24
+; AVX-NEXT:    popq %r15
+; AVX-NEXT:    .cfi_def_cfa_offset 16
+; AVX-NEXT:    popq %rbp
+; AVX-NEXT:    .cfi_def_cfa_offset 8
+; AVX-NEXT:    retq
 entry:
   %0 = load i8, i8* @D0
   %1 = load i8, i8* @D1
diff --git a/llvm/test/CodeGen/X86/spill2reg_simple_1_16bit.mir b/llvm/test/CodeGen/X86/spill2reg_simple_1_16bit.mir
index 65da23411e2ab1..8a0eacce46e49c 100644
--- a/llvm/test/CodeGen/X86/spill2reg_simple_1_16bit.mir
+++ b/llvm/test/CodeGen/X86/spill2reg_simple_1_16bit.mir
@@ -1,5 +1,6 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 # RUN: llc %s -o - -mtriple=x86_64-unknown-linux -enable-spill2reg -mattr=+sse4.1 --run-pass=spill2reg -simplify-mir -spill2reg-mem-instrs=0 -spill2reg-vec-instrs=99999 | FileCheck %s
+# RUN: llc %s -o - -mtriple=x86_64-unknown-linux -enable-spill2reg -mattr=+avx --run-pass=spill2reg -simplify-mir -spill2reg-mem-instrs=0 -spill2reg-vec-instrs=99999 | FileCheck --check-prefix=AVX %s
 
 # Simple test with a single 16-bit spill-reload pair:
 #   spill stack.0
@@ -30,6 +31,12 @@ body:             |
     ; CHECK-NEXT: $eax = MOVPDI2DIrr $xmm0
     ; CHECK-NEXT: MOV16mr $rip, 1, $noreg, @U0, $noreg, killed renamable $ax :: (store (s16) into @U0)
     ; CHECK-NEXT: RET 0
+    ; AVX-LABEL: name: func
+    ; AVX: $ax = MOV16rm $rip, 1, $noreg, @D0, $noreg :: (dereferenceable load (s16) from @D0)
+    ; AVX-NEXT: $xmm0 = VMOVDI2PDIZrr $eax
+    ; AVX-NEXT: $eax = VMOVPDI2DIZrr $xmm0
+    ; AVX-NEXT: MOV16mr $rip, 1, $noreg, @U0, $noreg, killed renamable $ax :: (store (s16) into @U0)
+    ; AVX-NEXT: RET 0
     $ax = MOV16rm $rip, 1, $noreg, @D0, $noreg :: (dereferenceable load (s16) from @D0)
     MOV16mr %stack.0, 1, $noreg, 0, $noreg, killed renamable $ax :: (store (s16) into %stack.0)
     ; reload
diff --git a/llvm/test/CodeGen/X86/spill2reg_simple_1_32bit.mir b/llvm/test/CodeGen/X86/spill2reg_simple_1_32bit.mir
index ddcf84134d89f7..8bc2d870c929b1 100644
--- a/llvm/test/CodeGen/X86/spill2reg_simple_1_32bit.mir
+++ b/llvm/test/CodeGen/X86/spill2reg_simple_1_32bit.mir
@@ -1,6 +1,7 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 # RUN: llc %s -o - -mtriple=x86_64-unknown-linux -enable-spill2reg -mattr=+sse4.1 --run-pass=spill2reg -simplify-mir -spill2reg-mem-instrs=0 -spill2reg-vec-instrs=99999 | FileCheck %s
 # RUN: llc %s -o - -mtriple=x86_64-unknown-linux -enable-spill2reg -mattr=-sse4.1 --run-pass=spill2reg -simplify-mir -spill2reg-mem-instrs=0 -spill2reg-vec-instrs=99999 | FileCheck --check-prefix=NOSSE %s
+# RUN: llc %s -o - -mtriple=x86_64-unknown-linux -enable-spill2reg -mattr=+avx --run-pass=spill2reg -simplify-mir -spill2reg-mem-instrs=0 -spill2reg-vec-instrs=99999 | FileCheck --check-prefix=AVX %s
 
 # Simple test with a single spill-reload pair (32-bit version):
 #   spill stack.0
@@ -24,6 +25,13 @@ machineFunctionInfo: {}
 body:             |
 
 
+ ; AVX-LABEL: bb.0:
+ ; AVX-NEXT:    $eax = MOV32rm $rip, 1, $noreg, @D0, $noreg :: (dereferenceable load (s32) from @D0)
+ ; AVX-NEXT:    $xmm0 = VMOVDI2PDIZrr $eax
+ ; AVX-NEXT:    $eax = VMOVPDI2DIZrr $xmm0
+ ; AVX-NEXT:    MOV32mr $rip, 1, $noreg, @U0, $noreg, killed renamable $eax :: (store (s32) into @U0)
+ ; AVX-NEXT:    RET 0
+
   bb.0:
     ; spill
     ; CHECK-LABEL: name: func
diff --git a/llvm/test/CodeGen/X86/spill2reg_simple_1_64bit.mir b/llvm/test/CodeGen/X86/spill2reg_simple_1_64bit.mir
index 5d5baa730a5983..e62f5d015c7280 100644
--- a/llvm/test/CodeGen/X86/spill2reg_simple_1_64bit.mir
+++ b/llvm/test/CodeGen/X86/spill2reg_simple_1_64bit.mir
@@ -1,6 +1,8 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 # RUN: llc %s -o - -mtriple=x86_64-unknown-linux -enable-spill2reg -mattr=+sse4.1 --run-pass=spill2reg -simplify-mir -spill2reg-mem-instrs=0 -spill2reg-vec-instrs=99999 | FileCheck %s
 # RUN: llc %s -o - -mtriple=x86_64-unknown-linux -enable-spill2reg -mattr=-sse4.1 --run-pass=spill2reg -simplify-mir -spill2reg-mem-instrs=0 -spill2reg-vec-instrs=99999 | FileCheck --check-prefix=NOSSE %s
+# RUN: llc %s -o - -mtriple=x86_64-unknown-linux -enable-spill2reg -mattr=+avx --run-pass=spill2reg -simplify-mir -spill2reg-mem-instrs=0 -spill2reg-vec-instrs=99999 | FileCheck --check-prefix=AVX %s
+# RUN: llc %s -o - -mtriple=x86_64-unknown-linux -enable-spill2reg -spill2reg-no-avx -mattr=+avx --run-pass=spill2reg -simplify-mir -spill2reg-mem-instrs=0 -spill2reg-vec-instrs=99999 | FileCheck --check-prefix=NO_AVX_FLAG %s
 
 # Simple test with a single spill-reload pair (64-bit version):
 #   spill stack.0
@@ -38,6 +40,18 @@ body:             |
     ; NOSSE-NEXT: $rax = MOV64rm %stack.0, 1, $noreg, 0, $noreg :: (load (s64) from %stack.0)
     ; NOSSE-NEXT: MOV64mr $rip, 1, $noreg, @U0, $noreg, killed renamable $rax :: (store (s64) into @U0)
     ; NOSSE-NEXT: RET 0
+    ; AVX-LABEL: name: func
+    ; AVX: $rax = MOV64rm $rip, 1, $noreg, @D0, $noreg :: (dereferenceable load (s64) from @D0)
+    ; AVX-NEXT: $xmm0 = VMOV64toPQIZrr $rax
+    ; AVX-NEXT: $rax = VMOVPQIto64Zrr $xmm0
+    ; AVX-NEXT: MOV64mr $rip, 1, $noreg, @U0, $noreg, killed renamable $rax :: (store (s64) into @U0)
+    ; AVX-NEXT: RET 0
+    ; NO_AVX_FLAG-LABEL: name: func
+    ; NO_AVX_FLAG: $rax = MOV64rm $rip, 1, $noreg, @D0, $noreg :: (dereferenceable load (s64) from @D0)
+    ; NO_AVX_FLAG-NEXT: $xmm0 = MOV64toPQIrr $rax
+    ; NO_AVX_FLAG-NEXT: $rax = MOVPQIto64rr $xmm0
+    ; NO_AVX_FLAG-NEXT: MOV64mr $rip, 1, $noreg, @U0, $noreg, killed renamable $rax :: (store (s64) into @U0)
+    ; NO_AVX_FLAG-NEXT: RET 0
     $rax = MOV64rm $rip, 1, $noreg, @D0, $noreg :: (dereferenceable load (s64) from @D0)
     MOV64mr %stack.0, 1, $noreg, 0, $noreg, killed renamable $rax :: (store (s64) into %stack.0)
     ; reload
diff --git a/llvm/test/CodeGen/X86/spill2reg_simple_1_8bit.mir b/llvm/test/CodeGen/X86/spill2reg_simple_1_8bit.mir
index a34a4c5748c014..be70bf981873eb 100644
--- a/llvm/test/CodeGen/X86/spill2reg_simple_1_8bit.mir
+++ b/llvm/test/CodeGen/X86/spill2reg_simple_1_8bit.mir
@@ -1,5 +1,6 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 # RUN: llc %s -o - -mtriple=x86_64-unknown-linux -enable-spill2reg -mattr=+sse4.1 --run-pass=spill2reg -simplify-mir -spill2reg-mem-instrs=0 -spill2reg-vec-instrs=99999 | FileCheck %s
+# RUN: llc %s -o - -mtriple=x86_64-unknown-linux -enable-spill2reg -mattr=+avx --run-pass=spill2reg -simplify-mir -spill2reg-mem-instrs=0 -spill2reg-vec-instrs=99999 | FileCheck --check-prefix=AVX %s
 
 # Simple test with a single 8-bit spill-reload pair:
 #   spill stack.0
@@ -30,6 +31,12 @@ body:             |
     ; CHECK-NEXT: $eax = MOVPDI2DIrr $xmm0
     ; CHECK-NEXT: MOV8mr $rip, 1, $noreg, @U0, $noreg, killed renamable $al :: (store (s8) into @U0)
     ; CHECK-NEXT: RET 0
+    ; AVX-LABEL: name: func
+    ; AVX: $al = MOV8rm $rip, 1, $noreg, @D0, $noreg :: (dereferenceable load (s8) from @D0)
+    ; AVX-NEXT: $xmm0 = VMOVDI2PDIZrr $eax
+    ; AVX-NEXT: $eax = VMOVPDI2DIZrr $xmm0
+    ; AVX-NEXT: MOV8mr $rip, 1, $noreg, @U0, $noreg, killed renamable $al :: (store (s8) into @U0)
+    ; AVX-NEXT: RET 0
     $al = MOV8rm $rip, 1, $noreg, @D0, $noreg :: (dereferenceable load (s8) from @D0)
     MOV8mr %stack.0, 1, $noreg, 0, $noreg, killed renamable $al :: (store (s8) into %stack.0)
     ; reload


