[llvm] [StackColoring] Change the StackColoring logic + enables it to handle spills (PR #143800)

via llvm-commits llvm-commits at lists.llvm.org
Sun Jun 22 05:16:31 PDT 2025


https://github.com/Ralender updated https://github.com/llvm/llvm-project/pull/143800

>From f9c024d7c9b217db49518831cd75c5ead260f742 Mon Sep 17 00:00:00 2001
From: tyker <tyker1 at outlook.com>
Date: Tue, 20 May 2025 22:28:40 +0200
Subject: [PATCH 01/19] [NFC][StackColoring] Remove unused member for
 StackColoring

---
 llvm/lib/CodeGen/StackColoring.cpp | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/llvm/lib/CodeGen/StackColoring.cpp b/llvm/lib/CodeGen/StackColoring.cpp
index 0f93822d9792b..8946c7cd44058 100644
--- a/llvm/lib/CodeGen/StackColoring.cpp
+++ b/llvm/lib/CodeGen/StackColoring.cpp
@@ -402,9 +402,6 @@ class StackColoring {
   using LivenessMap = DenseMap<const MachineBasicBlock *, BlockLifetimeInfo>;
   LivenessMap BlockLiveness;
 
-  /// Maps serial numbers to basic blocks.
-  DenseMap<const MachineBasicBlock *, int> BasicBlocks;
-
   /// Maps basic blocks to a serial number.
   SmallVector<const MachineBasicBlock *, 8> BasicBlockNumbering;
 
@@ -727,7 +724,6 @@ unsigned StackColoring::collectMarkers(unsigned NumSlot) {
   // deterministic numbering.
   for (MachineBasicBlock *MBB : depth_first(MF)) {
     // Assign a serial number to this basic block.
-    BasicBlocks[MBB] = BasicBlockNumbering.size();
     BasicBlockNumbering.push_back(MBB);
 
     // Keep a reference to avoid repeated lookups.
@@ -1211,7 +1207,6 @@ bool StackColoring::run(MachineFunction &Func) {
   MF = &Func;
   MFI = &MF->getFrameInfo();
   BlockLiveness.clear();
-  BasicBlocks.clear();
   BasicBlockNumbering.clear();
   Markers.clear();
   Intervals.clear();

>From 48f34ec2d6fc5f9fe21c6828d4dbc764958779d6 Mon Sep 17 00:00:00 2001
From: tyker <tyker1 at outlook.com>
Date: Tue, 20 May 2025 23:23:46 +0200
Subject: [PATCH 02/19] [NFC][StackColoring] Use LiveRange instead of
 LiveInterval in StackColoring

---
 llvm/lib/CodeGen/StackColoring.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/CodeGen/StackColoring.cpp b/llvm/lib/CodeGen/StackColoring.cpp
index 8946c7cd44058..609ee3bbc369c 100644
--- a/llvm/lib/CodeGen/StackColoring.cpp
+++ b/llvm/lib/CodeGen/StackColoring.cpp
@@ -407,7 +407,7 @@ class StackColoring {
 
   /// Maps slots to their use interval. Outside of this interval, slots
   /// values are either dead or `undef` and they will not be written to.
-  SmallVector<std::unique_ptr<LiveInterval>, 16> Intervals;
+  SmallVector<std::unique_ptr<LiveRange>, 16> Intervals;
 
   /// Maps slots to the points where they can become in-use.
   SmallVector<SmallVector<SlotIndex, 4>, 16> LiveStarts;
@@ -1035,7 +1035,7 @@ void StackColoring::remapInstructions(DenseMap<int, int> &SlotRemap) {
         // validating the instructions.
         if (!I.isDebugInstr() && TouchesMemory && ProtectFromEscapedAllocas) {
           SlotIndex Index = Indexes->getInstructionIndex(I);
-          const LiveInterval *Interval = &*Intervals[FromSlot];
+          const LiveRange *Interval = &*Intervals[FromSlot];
           assert(Interval->find(Index) != Interval->end() &&
                  "Found instruction usage outside of live range.");
         }
@@ -1155,7 +1155,7 @@ void StackColoring::removeInvalidSlotRanges() {
 
         // Check that the used slot is inside the calculated lifetime range.
         // If it is not, warn about it and invalidate the range.
-        LiveInterval *Interval = &*Intervals[Slot];
+        LiveRange *Interval = &*Intervals[Slot];
         SlotIndex Index = Indexes->getInstructionIndex(I);
         if (Interval->find(Index) == Interval->end()) {
           Interval->clear();
@@ -1247,7 +1247,7 @@ bool StackColoring::run(MachineFunction &Func) {
   }
 
   for (unsigned i=0; i < NumSlots; ++i) {
-    std::unique_ptr<LiveInterval> LI(new LiveInterval(i, 0));
+    std::unique_ptr<LiveRange> LI(new LiveRange());
     LI->getNextValue(Indexes->getZeroIndex(), VNInfoAllocator);
     Intervals.push_back(std::move(LI));
     SortedSlots.push_back(i);
@@ -1317,8 +1317,8 @@ bool StackColoring::run(MachineFunction &Func) {
         if (MFI->getStackID(FirstSlot) != MFI->getStackID(SecondSlot))
           continue;
 
-        LiveInterval *First = &*Intervals[FirstSlot];
-        LiveInterval *Second = &*Intervals[SecondSlot];
+        LiveRange *First = &*Intervals[FirstSlot];
+        LiveRange *Second = &*Intervals[SecondSlot];
         auto &FirstS = LiveStarts[FirstSlot];
         auto &SecondS = LiveStarts[SecondSlot];
         assert(!First->empty() && !Second->empty() && "Found an empty range");

>From 833849843a2d0cb25222e652d2bcf6a43f9e3f7d Mon Sep 17 00:00:00 2001
From: tyker <tyker1 at outlook.com>
Date: Tue, 10 Jun 2025 22:08:43 +0200
Subject: [PATCH 03/19] [NFC] Make the LiveRange constructor explicit

Without `explicit`, a surprising situation can arise: when a LiveRange* is passed to a function expecting a const LiveRange&,
the LiveRange* is implicitly converted to bool and a new, empty LiveRange is created.
---
 llvm/include/llvm/CodeGen/LiveInterval.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/LiveInterval.h b/llvm/include/llvm/CodeGen/LiveInterval.h
index e1c5717f5face..e9ca9e36c95b9 100644
--- a/llvm/include/llvm/CodeGen/LiveInterval.h
+++ b/llvm/include/llvm/CodeGen/LiveInterval.h
@@ -237,9 +237,9 @@ namespace llvm {
     }
 
     /// Constructs a new LiveRange object.
-    LiveRange(bool UseSegmentSet = false)
-        : segmentSet(UseSegmentSet ? std::make_unique<SegmentSet>()
-                                   : nullptr) {}
+    explicit LiveRange(bool UseSegmentSet = false)
+        : segmentSet(UseSegmentSet ? std::make_unique<SegmentSet>() : nullptr) {
+    }
 
     /// Constructs a new LiveRange object by copying segments and valnos from
     /// another LiveRange.

>From 5aeb0f01a28b0fbdde07c26dd0a3e221e3652405 Mon Sep 17 00:00:00 2001
From: tyker <tyker1 at outlook.com>
Date: Thu, 29 May 2025 15:06:25 +0200
Subject: [PATCH 04/19] [NFC] MachineFrameInfo::print add a bit more
 information

---
 llvm/lib/CodeGen/MachineFrameInfo.cpp | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/llvm/lib/CodeGen/MachineFrameInfo.cpp b/llvm/lib/CodeGen/MachineFrameInfo.cpp
index e4b993850f73d..14dc871d89c13 100644
--- a/llvm/lib/CodeGen/MachineFrameInfo.cpp
+++ b/llvm/lib/CodeGen/MachineFrameInfo.cpp
@@ -20,6 +20,7 @@
 #include "llvm/CodeGen/TargetRegisterInfo.h"
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
 #include "llvm/Config/llvm-config.h"
+#include "llvm/IR/Instructions.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include <cassert>
@@ -221,6 +222,12 @@ void MachineFrameInfo::print(const MachineFunction &MF, raw_ostream &OS) const{
     if (SO.StackID != 0)
       OS << "id=" << static_cast<unsigned>(SO.StackID) << ' ';
 
+    if (SO.Alloca && !SO.Alloca->getName().empty())
+      OS << "alloca=" << SO.Alloca->getName() << ' ';
+
+    if (SO.isSpillSlot)
+      OS << "spill ";
+
     if (SO.Size == ~0ULL) {
       OS << "dead\n";
       continue;

>From 0ee9f612f02ffb6c7d35301694b9d71b6972f965 Mon Sep 17 00:00:00 2001
From: tyker <tyker1 at outlook.com>
Date: Thu, 29 May 2025 17:42:37 +0200
Subject: [PATCH 05/19] [NFC][StackSlotColoring] Remove dead code

---
 llvm/lib/CodeGen/StackSlotColoring.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/llvm/lib/CodeGen/StackSlotColoring.cpp b/llvm/lib/CodeGen/StackSlotColoring.cpp
index 2f81bea4e86ba..aaff2d6238c1e 100644
--- a/llvm/lib/CodeGen/StackSlotColoring.cpp
+++ b/llvm/lib/CodeGen/StackSlotColoring.cpp
@@ -369,7 +369,6 @@ bool StackSlotColoring::ColorSlots(MachineFunction &MF) {
   SmallVector<int, 16> SlotMapping(NumObjs, -1);
   SmallVector<float, 16> SlotWeights(NumObjs, 0.0);
   SmallVector<SmallVector<int, 4>, 16> RevMap(NumObjs);
-  BitVector UsedColors(NumObjs);
 
   LLVM_DEBUG(dbgs() << "Color spill slot intervals:\n");
   bool Changed = false;
@@ -380,7 +379,6 @@ bool StackSlotColoring::ColorSlots(MachineFunction &MF) {
     SlotMapping[SS] = NewSS;
     RevMap[NewSS].push_back(SS);
     SlotWeights[NewSS] += li->weight();
-    UsedColors.set(NewSS);
     Changed |= (SS != NewSS);
   }
 

>From a352f1ecacbea556205af5220cc72624fb452047 Mon Sep 17 00:00:00 2001
From: tyker <tyker1 at outlook.com>
Date: Thu, 29 May 2025 23:01:22 +0200
Subject: [PATCH 06/19] [NFC][StackColoring] Use block numbers instead of maps

---
 llvm/lib/CodeGen/StackColoring.cpp | 43 +++++++++++++++---------------
 1 file changed, 21 insertions(+), 22 deletions(-)

diff --git a/llvm/lib/CodeGen/StackColoring.cpp b/llvm/lib/CodeGen/StackColoring.cpp
index 609ee3bbc369c..1bb1861a06f44 100644
--- a/llvm/lib/CodeGen/StackColoring.cpp
+++ b/llvm/lib/CodeGen/StackColoring.cpp
@@ -396,11 +396,11 @@ class StackColoring {
 
     /// Which slots are marked as LIVE_OUT, coming out of each basic block.
     BitVector LiveOut;
+
+    bool isEmpty() { return Begin.empty(); }
   };
 
-  /// Maps active slots (per bit) for each basic block.
-  using LivenessMap = DenseMap<const MachineBasicBlock *, BlockLifetimeInfo>;
-  LivenessMap BlockLiveness;
+  SmallVector<BlockLifetimeInfo, 0> BlockLiveness;
 
   /// Maps basic blocks to a serial number.
   SmallVector<const MachineBasicBlock *, 8> BasicBlockNumbering;
@@ -438,9 +438,6 @@ class StackColoring {
   bool run(MachineFunction &Func);
 
 private:
-  /// Used in collectMarkers
-  using BlockBitVecMap = DenseMap<const MachineBasicBlock *, BitVector>;
-
   /// Debug.
   void dump() const;
   void dumpIntervals() const;
@@ -538,9 +535,7 @@ LLVM_DUMP_METHOD void StackColoring::dumpBV(const char *tag,
 }
 
 LLVM_DUMP_METHOD void StackColoring::dumpBB(MachineBasicBlock *MBB) const {
-  LivenessMap::const_iterator BI = BlockLiveness.find(MBB);
-  assert(BI != BlockLiveness.end() && "Block not found");
-  const BlockLifetimeInfo &BlockInfo = BI->second;
+  const BlockLifetimeInfo &BlockInfo = BlockLiveness[MBB->getNumber()];
 
   dumpBV("BEGIN", BlockInfo.Begin);
   dumpBV("END", BlockInfo.End);
@@ -624,7 +619,7 @@ bool StackColoring::isLifetimeStartOrEnd(const MachineInstr &MI,
 
 unsigned StackColoring::collectMarkers(unsigned NumSlot) {
   unsigned MarkersFound = 0;
-  BlockBitVecMap SeenStartMap;
+  SmallVector<BitVector> SeenStartMap;
   InterestingSlots.clear();
   InterestingSlots.resize(NumSlot);
   ConservativeSlots.clear();
@@ -634,6 +629,8 @@ unsigned StackColoring::collectMarkers(unsigned NumSlot) {
   SmallVector<int, 8> NumStartLifetimes(NumSlot, 0);
   SmallVector<int, 8> NumEndLifetimes(NumSlot, 0);
 
+  SeenStartMap.resize(MF->getNumBlockIDs());
+
   // Step 1: collect markers and populate the "InterestingSlots"
   // and "ConservativeSlots" sets.
   for (MachineBasicBlock *MBB : depth_first(MF)) {
@@ -642,10 +639,11 @@ unsigned StackColoring::collectMarkers(unsigned NumSlot) {
     // to this bb).
     BitVector BetweenStartEnd;
     BetweenStartEnd.resize(NumSlot);
+    SeenStartMap[MBB->getNumber()].resize(NumSlot);
     for (const MachineBasicBlock *Pred : MBB->predecessors()) {
-      BlockBitVecMap::const_iterator I = SeenStartMap.find(Pred);
-      if (I != SeenStartMap.end()) {
-        BetweenStartEnd |= I->second;
+      BitVector &PredSet = SeenStartMap[Pred->getNumber()];
+      if (!PredSet.empty()) {
+        BetweenStartEnd |= PredSet;
       }
     }
 
@@ -691,7 +689,7 @@ unsigned StackColoring::collectMarkers(unsigned NumSlot) {
         }
       }
     }
-    BitVector &SeenStart = SeenStartMap[MBB];
+    BitVector &SeenStart = SeenStartMap[MBB->getNumber()];
     SeenStart |= BetweenStartEnd;
   }
   if (!MarkersFound) {
@@ -718,6 +716,7 @@ unsigned StackColoring::collectMarkers(unsigned NumSlot) {
 
   LLVM_DEBUG(dumpBV("Conservative slots", ConservativeSlots));
 
+  BlockLiveness.resize(MF->getNumBlockIDs());
   // Step 2: compute begin/end sets for each block
 
   // NOTE: We use a depth-first iteration to ensure that we obtain a
@@ -727,7 +726,7 @@ unsigned StackColoring::collectMarkers(unsigned NumSlot) {
     BasicBlockNumbering.push_back(MBB);
 
     // Keep a reference to avoid repeated lookups.
-    BlockLifetimeInfo &BlockInfo = BlockLiveness[MBB];
+    BlockLifetimeInfo &BlockInfo = BlockLiveness[MBB->getNumber()];
 
     BlockInfo.Begin.resize(NumSlot);
     BlockInfo.End.resize(NumSlot);
@@ -784,19 +783,19 @@ void StackColoring::calculateLocalLiveness() {
 
     for (const MachineBasicBlock *BB : BasicBlockNumbering) {
       // Use an iterator to avoid repeated lookups.
-      LivenessMap::iterator BI = BlockLiveness.find(BB);
-      assert(BI != BlockLiveness.end() && "Block not found");
-      BlockLifetimeInfo &BlockInfo = BI->second;
+      BlockLifetimeInfo &BlockInfo = BlockLiveness[BB->getNumber()];
+      if (BlockInfo.isEmpty())
+        continue;
 
       // Compute LiveIn by unioning together the LiveOut sets of all preds.
       LocalLiveIn.clear();
       for (MachineBasicBlock *Pred : BB->predecessors()) {
-        LivenessMap::const_iterator I = BlockLiveness.find(Pred);
+        BlockLifetimeInfo &PrefInfo = BlockLiveness[Pred->getNumber()];
         // PR37130: transformations prior to stack coloring can
         // sometimes leave behind statically unreachable blocks; these
         // can be safely skipped here.
-        if (I != BlockLiveness.end())
-          LocalLiveIn |= I->second.LiveOut;
+        if (!PrefInfo.isEmpty())
+          LocalLiveIn |= PrefInfo.LiveOut;
       }
 
       // Compute LiveOut by subtracting out lifetimes that end in this
@@ -840,7 +839,7 @@ void StackColoring::calculateLiveIntervals(unsigned NumSlots) {
     DefinitelyInUse.resize(NumSlots);
 
     // Start the interval of the slots that we previously found to be 'in-use'.
-    BlockLifetimeInfo &MBBLiveness = BlockLiveness[&MBB];
+    BlockLifetimeInfo &MBBLiveness = BlockLiveness[MBB.getNumber()];
     for (int pos = MBBLiveness.LiveIn.find_first(); pos != -1;
          pos = MBBLiveness.LiveIn.find_next(pos)) {
       Starts[pos] = Indexes->getMBBStartIdx(&MBB);

>From 5a1a3464cd4b6a25820691a90d27de081cf68258 Mon Sep 17 00:00:00 2001
From: tyker <tyker1 at outlook.com>
Date: Thu, 19 Jun 2025 23:01:33 +0200
Subject: [PATCH 07/19] [NFC] Move NumDigits to MathExtras.h and update some
 users of log10 to use NumDigits

---
 .../llvm/DebugInfo/PDB/Native/FormatUtil.h    | 43 -------------------
 llvm/include/llvm/Support/MathExtras.h        | 43 +++++++++++++++++++
 llvm/lib/DebugInfo/Symbolize/DIPrinter.cpp    |  2 +-
 llvm/lib/Support/Signals.cpp                  |  2 +-
 .../llvm-remarkutil/RemarkInstructionMix.cpp  |  2 +-
 llvm/utils/FileCheck/FileCheck.cpp            |  2 +-
 6 files changed, 47 insertions(+), 47 deletions(-)

diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/FormatUtil.h b/llvm/include/llvm/DebugInfo/PDB/Native/FormatUtil.h
index 76a019ddf8f34..a76b5c0d44791 100644
--- a/llvm/include/llvm/DebugInfo/PDB/Native/FormatUtil.h
+++ b/llvm/include/llvm/DebugInfo/PDB/Native/FormatUtil.h
@@ -62,49 +62,6 @@ LLVM_ABI std::string formatChunkKind(codeview::DebugSubsectionKind Kind,
 LLVM_ABI std::string formatSymbolKind(codeview::SymbolKind K);
 LLVM_ABI std::string formatTypeLeafKind(codeview::TypeLeafKind K);
 
-/// Returns the number of digits in the given integer.
-inline int NumDigits(uint64_t N) {
-  if (N < 10ULL)
-    return 1;
-  if (N < 100ULL)
-    return 2;
-  if (N < 1000ULL)
-    return 3;
-  if (N < 10000ULL)
-    return 4;
-  if (N < 100000ULL)
-    return 5;
-  if (N < 1000000ULL)
-    return 6;
-  if (N < 10000000ULL)
-    return 7;
-  if (N < 100000000ULL)
-    return 8;
-  if (N < 1000000000ULL)
-    return 9;
-  if (N < 10000000000ULL)
-    return 10;
-  if (N < 100000000000ULL)
-    return 11;
-  if (N < 1000000000000ULL)
-    return 12;
-  if (N < 10000000000000ULL)
-    return 13;
-  if (N < 100000000000000ULL)
-    return 14;
-  if (N < 1000000000000000ULL)
-    return 15;
-  if (N < 10000000000000000ULL)
-    return 16;
-  if (N < 100000000000000000ULL)
-    return 17;
-  if (N < 1000000000000000000ULL)
-    return 18;
-  if (N < 10000000000000000000ULL)
-    return 19;
-  return 20;
-}
-
 namespace detail {
 template <typename T>
 struct EndianAdapter final
diff --git a/llvm/include/llvm/Support/MathExtras.h b/llvm/include/llvm/Support/MathExtras.h
index ae3150e5602ee..a374f93d8538e 100644
--- a/llvm/include/llvm/Support/MathExtras.h
+++ b/llvm/include/llvm/Support/MathExtras.h
@@ -795,6 +795,49 @@ using stack_float_t = volatile float;
 using stack_float_t = float;
 #endif
 
+/// Returns the number of digits in the given integer.
+inline int NumDigits(uint64_t N) {
+  if (N < 10ULL)
+    return 1;
+  if (N < 100ULL)
+    return 2;
+  if (N < 1000ULL)
+    return 3;
+  if (N < 10000ULL)
+    return 4;
+  if (N < 100000ULL)
+    return 5;
+  if (N < 1000000ULL)
+    return 6;
+  if (N < 10000000ULL)
+    return 7;
+  if (N < 100000000ULL)
+    return 8;
+  if (N < 1000000000ULL)
+    return 9;
+  if (N < 10000000000ULL)
+    return 10;
+  if (N < 100000000000ULL)
+    return 11;
+  if (N < 1000000000000ULL)
+    return 12;
+  if (N < 10000000000000ULL)
+    return 13;
+  if (N < 100000000000000ULL)
+    return 14;
+  if (N < 1000000000000000ULL)
+    return 15;
+  if (N < 10000000000000000ULL)
+    return 16;
+  if (N < 100000000000000000ULL)
+    return 17;
+  if (N < 1000000000000000000ULL)
+    return 18;
+  if (N < 10000000000000000000ULL)
+    return 19;
+  return 20;
+}
+
 } // namespace llvm
 
 #endif
diff --git a/llvm/lib/DebugInfo/Symbolize/DIPrinter.cpp b/llvm/lib/DebugInfo/Symbolize/DIPrinter.cpp
index 989fde9749b18..1c6876ce4e87c 100644
--- a/llvm/lib/DebugInfo/Symbolize/DIPrinter.cpp
+++ b/llvm/lib/DebugInfo/Symbolize/DIPrinter.cpp
@@ -84,7 +84,7 @@ class SourceCode {
   void format(raw_ostream &OS) {
     if (!PrunedSource)
       return;
-    size_t MaxLineNumberWidth = std::ceil(std::log10(LastLine));
+    size_t MaxLineNumberWidth = NumDigits(LastLine);
     int64_t L = FirstLine;
     for (size_t Pos = 0; Pos < PrunedSource->size(); ++L) {
       size_t PosEnd = PrunedSource->find('\n', Pos);
diff --git a/llvm/lib/Support/Signals.cpp b/llvm/lib/Support/Signals.cpp
index 9f9030e79d104..b8449683363b0 100644
--- a/llvm/lib/Support/Signals.cpp
+++ b/llvm/lib/Support/Signals.cpp
@@ -221,7 +221,7 @@ static bool printSymbolizedStackTrace(StringRef Argv0, void **StackTrace,
   for (int i = 0; i < Depth; i++) {
     auto PrintLineHeader = [&]() {
       OS << right_justify(formatv("#{0}", frame_no++).str(),
-                          std::log10(Depth) + 2)
+                          NumDigits(Depth) + 1)
          << ' ' << format_ptr(StackTrace[i]) << ' ';
     };
     if (!Modules[i]) {
diff --git a/llvm/tools/llvm-remarkutil/RemarkInstructionMix.cpp b/llvm/tools/llvm-remarkutil/RemarkInstructionMix.cpp
index 7c8ac474c0fdb..9b0a518f7e49a 100644
--- a/llvm/tools/llvm-remarkutil/RemarkInstructionMix.cpp
+++ b/llvm/tools/llvm-remarkutil/RemarkInstructionMix.cpp
@@ -111,7 +111,7 @@ static Error tryInstructionMix() {
         Mix.begin(), Mix.end(), 1, [](unsigned MaxValue, const MixEntry &Elt) {
           return std::max(MaxValue, Elt.second);
         });
-    unsigned ValueWidth = std::log10(MaxValue) + 1;
+    unsigned ValueWidth = NumDigits(MaxValue);
     FOS << "Instruction";
     FOS.PadToColumn(MaxMnemonic + 1) << "Count\n";
     FOS << "-----------";
diff --git a/llvm/utils/FileCheck/FileCheck.cpp b/llvm/utils/FileCheck/FileCheck.cpp
index 9cf3a3164dfec..96e6e418b96f4 100644
--- a/llvm/utils/FileCheck/FileCheck.cpp
+++ b/llvm/utils/FileCheck/FileCheck.cpp
@@ -595,7 +595,7 @@ static void DumpAnnotatedInput(raw_ostream &OS, const FileCheckRequest &Req,
   unsigned LineCount = InputFileText.count('\n');
   if (InputFileEnd[-1] != '\n')
     ++LineCount;
-  unsigned LineNoWidth = std::log10(LineCount) + 1;
+  unsigned LineNoWidth = NumDigits(LineCount);
   // +3 below adds spaces (1) to the left of the (right-aligned) line numbers
   // on input lines and (2) to the right of the (left-aligned) labels on
   // annotation lines so that input lines and annotation lines are more

>From 353a1e58ef7085b19c577c2469c10c677e7bbe30 Mon Sep 17 00:00:00 2001
From: tyker <tyker1 at outlook.com>
Date: Thu, 19 Jun 2025 23:05:40 +0200
Subject: [PATCH 08/19] [NFC] Improve debug output of StackColoring

---
 llvm/lib/CodeGen/StackColoring.cpp | 50 ++++++++++++++++++++++++------
 1 file changed, 41 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/CodeGen/StackColoring.cpp b/llvm/lib/CodeGen/StackColoring.cpp
index 1bb1861a06f44..64a5e294d2a0c 100644
--- a/llvm/lib/CodeGen/StackColoring.cpp
+++ b/llvm/lib/CodeGen/StackColoring.cpp
@@ -442,7 +442,6 @@ class StackColoring {
   void dump() const;
   void dumpIntervals() const;
   void dumpBB(MachineBasicBlock *MBB) const;
-  void dumpBV(const char *tag, const BitVector &BV) const;
 
   /// Removes all of the lifetime marker instructions from the function.
   /// \returns true if any markers were removed.
@@ -526,12 +525,39 @@ void StackColoringLegacy::getAnalysisUsage(AnalysisUsage &AU) const {
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-LLVM_DUMP_METHOD void StackColoring::dumpBV(const char *tag,
-                                            const BitVector &BV) const {
-  dbgs() << tag << " : { ";
-  for (unsigned I = 0, E = BV.size(); I != E; ++I)
-    dbgs() << BV.test(I) << " ";
-  dbgs() << "}\n";
+
+LLVM_DUMP_METHOD void dumpBV(StringRef tag, const BitVector &BV) {
+  constexpr unsigned ColumnWidth = 150;
+  unsigned LineStartOffset = tag.size() + /*" : "*/ 3;
+  unsigned WidthAfterTag = ColumnWidth - LineStartOffset;
+  unsigned NumBitsPerColumn = WidthAfterTag / 2;
+  unsigned BitsCount = BV.size();
+  for (unsigned Bits = 0; Bits < BitsCount; Bits += NumBitsPerColumn) {
+    unsigned Start = Bits;
+    unsigned End = std::min(Start + NumBitsPerColumn, BitsCount);
+
+    dbgs() << tag << " : ";
+
+    for (unsigned I = Start; I < End; ++I)
+      dbgs() << BV.test(I) << " ";
+    dbgs() << '\n';
+    dbgs() << tag << " : ";
+    unsigned next = Start;
+    for (unsigned I = Start; I < End; ++I) {
+      if (I < next)
+        continue;
+      if (BV.test(I)) {
+        int numDigits = NumDigits(I);
+        // Make sure number have spacing while staying aligned to the line above
+        next = I + 1 + numDigits / 2;
+        dbgs() << I << ' ';
+        if (numDigits % 2 == 0)
+          dbgs() << ' ';
+      } else
+        dbgs() << "  ";
+    }
+    dbgs() << '\n';
+  }
 }
 
 LLVM_DUMP_METHOD void StackColoring::dumpBB(MachineBasicBlock *MBB) const {
@@ -553,8 +579,14 @@ LLVM_DUMP_METHOD void StackColoring::dump() const {
 
 LLVM_DUMP_METHOD void StackColoring::dumpIntervals() const {
   for (unsigned I = 0, E = Intervals.size(); I != E; ++I) {
-    dbgs() << "Interval[" << I << "]:\n";
-    Intervals[I]->dump();
+    dbgs() << "Interval[" << I << "]:";
+    if (MFI->getObjectAllocation(I))
+      dbgs() << *MFI->getObjectAllocation(I);
+    dbgs() << '\n' << *Intervals[I] << '\n';
+    dbgs() << "LiveStarts:";
+    for (SlotIndex SIdx : LiveStarts[I])
+      dbgs() << ' ' << SIdx;
+    dbgs() << '\n';
   }
 }
 #endif

>From 3c5f135c17c5189354749ecac86dba592360e91c Mon Sep 17 00:00:00 2001
From: tyker <tyker1 at outlook.com>
Date: Thu, 29 May 2025 16:18:52 +0200
Subject: [PATCH 09/19] [NFC][LiveStacks] Use vectors instead of map and
 unordered_map

---
 llvm/include/llvm/CodeGen/LiveStacks.h        | 44 ++++++++-----------
 llvm/lib/CodeGen/LiveStacks.cpp               | 41 +++++++++--------
 llvm/lib/CodeGen/StackSlotColoring.cpp        | 20 +++------
 .../AMDGPU/AMDGPUMarkLastScratchLoad.cpp      |  6 +--
 4 files changed, 49 insertions(+), 62 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/LiveStacks.h b/llvm/include/llvm/CodeGen/LiveStacks.h
index 02c640bfc4a93..3b4550901dc7e 100644
--- a/llvm/include/llvm/CodeGen/LiveStacks.h
+++ b/llvm/include/llvm/CodeGen/LiveStacks.h
@@ -40,49 +40,43 @@ class LiveStacks {
   ///
   VNInfo::Allocator VNInfoAllocator;
 
-  /// S2IMap - Stack slot indices to live interval mapping.
-  using SS2IntervalMap = std::unordered_map<int, LiveInterval>;
-  SS2IntervalMap S2IMap;
-
-  /// S2RCMap - Stack slot indices to register class mapping.
-  std::map<int, const TargetRegisterClass *> S2RCMap;
+  int StartIdx = -1;
+  SmallVector<LiveInterval *> S2LI;
+  SmallVector<const TargetRegisterClass *> S2RC;
 
 public:
-  using iterator = SS2IntervalMap::iterator;
-  using const_iterator = SS2IntervalMap::const_iterator;
+  using iterator = SmallVector<LiveInterval *>::iterator;
+  using const_iterator = SmallVector<LiveInterval *>::const_iterator;
 
-  const_iterator begin() const { return S2IMap.begin(); }
-  const_iterator end() const { return S2IMap.end(); }
-  iterator begin() { return S2IMap.begin(); }
-  iterator end() { return S2IMap.end(); }
+  const_iterator begin() const { return S2LI.begin(); }
+  const_iterator end() const { return S2LI.end(); }
+  iterator begin() { return S2LI.begin(); }
+  iterator end() { return S2LI.end(); }
 
-  unsigned getNumIntervals() const { return (unsigned)S2IMap.size(); }
+  unsigned getStartIdx() const { return StartIdx; }
+  unsigned getNumIntervals() const { return (unsigned)S2LI.size(); }
 
   LiveInterval &getOrCreateInterval(int Slot, const TargetRegisterClass *RC);
 
   LiveInterval &getInterval(int Slot) {
     assert(Slot >= 0 && "Spill slot indice must be >= 0");
-    SS2IntervalMap::iterator I = S2IMap.find(Slot);
-    assert(I != S2IMap.end() && "Interval does not exist for stack slot");
-    return I->second;
+    return *S2LI[Slot - StartIdx];
   }
 
   const LiveInterval &getInterval(int Slot) const {
     assert(Slot >= 0 && "Spill slot indice must be >= 0");
-    SS2IntervalMap::const_iterator I = S2IMap.find(Slot);
-    assert(I != S2IMap.end() && "Interval does not exist for stack slot");
-    return I->second;
+    return *S2LI[Slot - StartIdx];
   }
 
-  bool hasInterval(int Slot) const { return S2IMap.count(Slot); }
+  bool hasInterval(int Slot) const {
+    if (Slot < StartIdx || StartIdx == -1)
+      return false;
+    return !getInterval(Slot).empty();
+  }
 
   const TargetRegisterClass *getIntervalRegClass(int Slot) const {
     assert(Slot >= 0 && "Spill slot indice must be >= 0");
-    std::map<int, const TargetRegisterClass *>::const_iterator I =
-        S2RCMap.find(Slot);
-    assert(I != S2RCMap.end() &&
-           "Register class info does not exist for stack slot");
-    return I->second;
+    return S2RC[Slot - StartIdx];
   }
 
   VNInfo::Allocator &getVNInfoAllocator() { return VNInfoAllocator; }
diff --git a/llvm/lib/CodeGen/LiveStacks.cpp b/llvm/lib/CodeGen/LiveStacks.cpp
index c07d985a09d1f..ea158b2d96a4e 100644
--- a/llvm/lib/CodeGen/LiveStacks.cpp
+++ b/llvm/lib/CodeGen/LiveStacks.cpp
@@ -37,10 +37,12 @@ void LiveStacksWrapperLegacy::getAnalysisUsage(AnalysisUsage &AU) const {
 }
 
 void LiveStacks::releaseMemory() {
+  for (int Idx = 0; Idx < (int)S2LI.size(); ++Idx)
+    S2LI[Idx]->~LiveInterval();
   // Release VNInfo memory regions, VNInfo objects don't need to be dtor'd.
   VNInfoAllocator.Reset();
-  S2IMap.clear();
-  S2RCMap.clear();
+  S2LI.clear();
+  S2RC.clear();
 }
 
 void LiveStacks::init(MachineFunction &MF) {
@@ -52,20 +54,22 @@ void LiveStacks::init(MachineFunction &MF) {
 LiveInterval &
 LiveStacks::getOrCreateInterval(int Slot, const TargetRegisterClass *RC) {
   assert(Slot >= 0 && "Spill slot indice must be >= 0");
-  SS2IntervalMap::iterator I = S2IMap.find(Slot);
-  if (I == S2IMap.end()) {
-    I = S2IMap
-            .emplace(
-                std::piecewise_construct, std::forward_as_tuple(Slot),
-                std::forward_as_tuple(Register::index2StackSlot(Slot), 0.0F))
-            .first;
-    S2RCMap.insert(std::make_pair(Slot, RC));
+  if (StartIdx == -1)
+    StartIdx = Slot;
+
+  int Idx = Slot - StartIdx;
+  assert(Idx >= 0 && "Slot not in order ?");
+  if (Idx < (int)S2LI.size()) {
+    S2RC[Idx] = TRI->getCommonSubClass(S2RC[Idx], RC);
   } else {
-    // Use the largest common subclass register class.
-    const TargetRegisterClass *&OldRC = S2RCMap[Slot];
-    OldRC = TRI->getCommonSubClass(OldRC, RC);
+    S2RC.resize(Idx + 1);
+    S2LI.resize(Idx + 1);
+    S2LI[Idx] = this->VNInfoAllocator.Allocate<LiveInterval>();
+    new (S2LI[Idx]) LiveInterval(Register::index2StackSlot(Slot), 0.0F);
+    S2RC[Idx] = RC;
   }
-  return I->second;
+  assert(S2RC.size() == S2LI.size());
+  return *S2LI[Idx];
 }
 
 AnalysisKey LiveStacksAnalysis::Key;
@@ -96,13 +100,12 @@ void LiveStacksWrapperLegacy::print(raw_ostream &OS, const Module *) const {
 }
 
 /// print - Implement the dump method.
-void LiveStacks::print(raw_ostream &OS, const Module*) const {
+void LiveStacks::print(raw_ostream &OS, const Module *) const {
 
   OS << "********** INTERVALS **********\n";
-  for (const_iterator I = begin(), E = end(); I != E; ++I) {
-    I->second.print(OS);
-    int Slot = I->first;
-    const TargetRegisterClass *RC = getIntervalRegClass(Slot);
+  for (int Idx = 0; Idx < (int)S2LI.size(); ++Idx) {
+    S2LI[Idx]->print(OS);
+    const TargetRegisterClass *RC = S2RC[Idx];
     if (RC)
       OS << " [" << TRI->getRegClassName(RC) << "]\n";
     else
diff --git a/llvm/lib/CodeGen/StackSlotColoring.cpp b/llvm/lib/CodeGen/StackSlotColoring.cpp
index aaff2d6238c1e..95597be5f1ebe 100644
--- a/llvm/lib/CodeGen/StackSlotColoring.cpp
+++ b/llvm/lib/CodeGen/StackSlotColoring.cpp
@@ -262,24 +262,14 @@ void StackSlotColoring::InitializeSlots() {
   UsedColors[0].resize(LastFI);
   Assignments.resize(LastFI);
 
-  using Pair = std::iterator_traits<LiveStacks::iterator>::value_type;
-
-  SmallVector<Pair *, 16> Intervals;
-
-  Intervals.reserve(LS->getNumIntervals());
-  for (auto &I : *LS)
-    Intervals.push_back(&I);
-  llvm::sort(Intervals,
-             [](Pair *LHS, Pair *RHS) { return LHS->first < RHS->first; });
-
   // Gather all spill slots into a list.
   LLVM_DEBUG(dbgs() << "Spill slot intervals:\n");
-  for (auto *I : Intervals) {
-    LiveInterval &li = I->second;
-    LLVM_DEBUG(li.dump());
-    int FI = li.reg().stackSlotIndex();
-    if (MFI->isDeadObjectIndex(FI))
+  for (auto [Idx, I] : llvm::enumerate(*LS)) {
+    int FI = Idx + LS->getStartIdx();
+    if (!I || MFI->isDeadObjectIndex(FI))
       continue;
+    LiveInterval &li = *I;
+    LLVM_DEBUG(li.dump());
 
     SSIntervals.push_back(&li);
     OrigAlignments[FI] = MFI->getObjectAlign(FI);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMarkLastScratchLoad.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMarkLastScratchLoad.cpp
index 9b6bb56c85d24..2dcf695e9c583 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMarkLastScratchLoad.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMarkLastScratchLoad.cpp
@@ -102,15 +102,15 @@ bool AMDGPUMarkLastScratchLoad::run(MachineFunction &MF) {
 
   bool Changed = false;
 
-  for (auto &[SS, LI] : *LS) {
-    for (const LiveRange::Segment &Segment : LI.segments) {
+  for (auto *LI : *LS) {
+    for (const LiveRange::Segment &Segment : LI->segments) {
 
       // Ignore segments that run to the end of basic block because in this case
       // slot is still live at the end of it.
       if (Segment.end.isBlock())
         continue;
 
-      const int FrameIndex = LI.reg().stackSlotIndex();
+      const int FrameIndex = LI->reg().stackSlotIndex();
       MachineInstr *LastLoad = nullptr;
 
       MachineInstr *MISegmentEnd = SI->getInstructionFromIndex(Segment.end);

>From 0ff8b6c91202310c465a39a8dc8748db16c39f6a Mon Sep 17 00:00:00 2001
From: tyker <tyker1 at outlook.com>
Date: Thu, 22 May 2025 20:11:39 +0200
Subject: [PATCH 10/19] [NFC][CodeGen] Cleanup lifetime in StackColoring
 instead of DeadMachineInstructionElim

---
 llvm/lib/CodeGen/MachineInstr.cpp      |  4 ---
 llvm/lib/CodeGen/StackColoring.cpp     | 21 +++++++++++++---
 llvm/test/CodeGen/X86/StackColoring.ll | 35 ++++++++++++++++++++++++++
 3 files changed, 52 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/CodeGen/MachineInstr.cpp b/llvm/lib/CodeGen/MachineInstr.cpp
index da3665b3b6a0b..3e5fd59534105 100644
--- a/llvm/lib/CodeGen/MachineInstr.cpp
+++ b/llvm/lib/CodeGen/MachineInstr.cpp
@@ -1417,10 +1417,6 @@ bool MachineInstr::isDead(const MachineRegisterInfo &MRI,
   if (isInlineAsm())
     return false;
 
-  // FIXME: See issue #105950 for why LIFETIME markers are considered dead here.
-  if (isLifetimeMarker())
-    return true;
-
   // If there are no defs with uses, then we call the instruction dead so long
   // as we do not suspect it may have sideeffects.
   return wouldBeTriviallyDead();
diff --git a/llvm/lib/CodeGen/StackColoring.cpp b/llvm/lib/CodeGen/StackColoring.cpp
index 64a5e294d2a0c..22f50c88dec21 100644
--- a/llvm/lib/CodeGen/StackColoring.cpp
+++ b/llvm/lib/CodeGen/StackColoring.cpp
@@ -686,8 +686,10 @@ unsigned StackColoring::collectMarkers(unsigned NumSlot) {
       if (MI.getOpcode() == TargetOpcode::LIFETIME_START ||
           MI.getOpcode() == TargetOpcode::LIFETIME_END) {
         int Slot = getStartOrEndSlot(MI);
-        if (Slot < 0)
+        if (Slot < 0) {
+          Markers.push_back(&MI);
           continue;
+        }
         InterestingSlots.set(Slot);
         if (MI.getOpcode() == TargetOpcode::LIFETIME_START) {
           BetweenStartEnd.set(Slot);
@@ -927,6 +929,17 @@ bool StackColoring::removeAllMarkers() {
   }
   Markers.clear();
 
+  for (MachineBasicBlock &MBB : *MF) {
+    if (BlockLiveness.empty() || BlockLiveness[MBB.getNumber()].isEmpty())
+      for (MachineInstr &MI : make_early_inc_range(MBB)) {
+        if (MI.getOpcode() == TargetOpcode::LIFETIME_START ||
+            MI.getOpcode() == TargetOpcode::LIFETIME_END) {
+          Count++;
+          MI.eraseFromParent();
+        }
+      }
+  }
+
   LLVM_DEBUG(dbgs() << "Removed " << Count << " markers.\n");
   return Count;
 }
@@ -1247,8 +1260,8 @@ bool StackColoring::run(MachineFunction &Func) {
   unsigned NumSlots = MFI->getObjectIndexEnd();
 
   // If there are no stack slots then there are no markers to remove.
-  if (!NumSlots)
-    return false;
+  if (!NumSlots || DisableColoring)
+    return removeAllMarkers();
 
   SmallVector<int, 8> SortedSlots;
   SortedSlots.reserve(NumSlots);
@@ -1272,7 +1285,7 @@ bool StackColoring::run(MachineFunction &Func) {
 
   // Don't continue because there are not enough lifetime markers, or the
   // stack is too small, or we are told not to optimize the slots.
-  if (NumMarkers < 2 || TotalSize < 16 || DisableColoring) {
+  if (NumMarkers < 2 || TotalSize < 16) {
     LLVM_DEBUG(dbgs() << "Will not try to merge slots.\n");
     return removeAllMarkers();
   }
diff --git a/llvm/test/CodeGen/X86/StackColoring.ll b/llvm/test/CodeGen/X86/StackColoring.ll
index db3e7dcdfe2d5..4cc54c5bd1361 100644
--- a/llvm/test/CodeGen/X86/StackColoring.ll
+++ b/llvm/test/CodeGen/X86/StackColoring.ll
@@ -581,6 +581,41 @@ onerr:
 
 %Data = type { [32 x i64] }
 
+declare void @throw()
+
+declare i32 @__CxxFrameHandler3(...)
+
+declare void @llvm.trap()
+
+;CHECK-LABEL: removed_all_lifetime:
+;YESCOLOR-NOT: LIFETIME_END
+;NOFIRSTUSE-NOT: LIFETIME_END
+;NOCOLOR-NOT: LIFETIME_END
+define void @removed_all_lifetime() personality ptr @__CxxFrameHandler3 {
+entry:
+  %alloca2 = alloca ptr, align 4
+  %alloca1 = alloca ptr, align 4
+  store volatile ptr null, ptr %alloca1
+  invoke void @throw()
+          to label %unreachable unwind label %catch.dispatch
+
+catch.dispatch:                                   ; preds = %entry
+  %cs = catchswitch within none [label %catch.pad] unwind to caller
+
+catch.pad:                                        ; preds = %catch.dispatch
+  %cp = catchpad within %cs [ptr null, i32 0, ptr %alloca1]
+  %v = load volatile ptr, ptr %alloca1
+  store volatile ptr null, ptr %alloca1
+  call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %alloca1)
+  call void @llvm.lifetime.start.p0(i64 4, ptr %alloca2)
+  store volatile ptr null, ptr %alloca1
+  call void @llvm.trap()
+  unreachable
+
+unreachable:                                      ; preds = %entry
+  unreachable
+}
+
 declare void @destructor()
 
 declare void @inita(ptr)

>From eef9b6e892668d00fb1dc8356126cb334c86d818 Mon Sep 17 00:00:00 2001
From: tyker <tyker1 at outlook.com>
Date: Fri, 23 May 2025 22:23:00 +0200
Subject: [PATCH 11/19] [CodeGen] Add option to move StackColoring after
 register allocation + deal with direct fallout

---
 llvm/include/llvm/CodeGen/MachineInstr.h |  4 +++-
 llvm/lib/CodeGen/StackColoring.cpp       |  2 +-
 llvm/lib/CodeGen/TargetPassConfig.cpp    | 17 ++++++++++++++---
 3 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/MachineInstr.h b/llvm/include/llvm/CodeGen/MachineInstr.h
index 94d04b82666be..faf860c656af4 100644
--- a/llvm/include/llvm/CodeGen/MachineInstr.h
+++ b/llvm/include/llvm/CodeGen/MachineInstr.h
@@ -1340,7 +1340,9 @@ class MachineInstr
   }
 
   // True if the instruction represents a position in the function.
-  bool isPosition() const { return isLabel() || isCFIInstruction(); }
+  bool isPosition() const {
+    return isLifetimeMarker() || isLabel() || isCFIInstruction();
+  }
 
   bool isNonListDebugValue() const {
     return getOpcode() == TargetOpcode::DBG_VALUE;
diff --git a/llvm/lib/CodeGen/StackColoring.cpp b/llvm/lib/CodeGen/StackColoring.cpp
index 22f50c88dec21..c5fd00b558d9f 100644
--- a/llvm/lib/CodeGen/StackColoring.cpp
+++ b/llvm/lib/CodeGen/StackColoring.cpp
@@ -686,7 +686,7 @@ unsigned StackColoring::collectMarkers(unsigned NumSlot) {
       if (MI.getOpcode() == TargetOpcode::LIFETIME_START ||
           MI.getOpcode() == TargetOpcode::LIFETIME_END) {
         int Slot = getStartOrEndSlot(MI);
-        if (Slot < 0) {
+        if (Slot < 0 || MFI->isObjectPreAllocated(Slot)) {
           Markers.push_back(&MI);
           continue;
         }
diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp
index 4ae52b056d844..864c7c8acd3b2 100644
--- a/llvm/lib/CodeGen/TargetPassConfig.cpp
+++ b/llvm/lib/CodeGen/TargetPassConfig.cpp
@@ -266,6 +266,9 @@ static cl::opt<bool>
                     cl::desc("Split static data sections into hot and cold "
                              "sections using profile information"));
 
+static cl::opt<bool> MergedStackColoring("merged-stack-coloring",
+                                         cl::init(false), cl::Hidden);
+
 /// Allow standard passes to be disabled by command line options. This supports
 /// simple binary flags that either suppress the pass or do nothing.
 /// i.e. -disable-mypass=false has no effect.
@@ -1305,9 +1308,11 @@ void TargetPassConfig::addMachineSSAOptimization() {
   // instructions dead.
   addPass(&OptimizePHIsLegacyID);
 
-  // This pass merges large allocas. StackSlotColoring is a different pass
-  // which merges spill slots.
-  addPass(&StackColoringLegacyID);
+  if (!MergedStackColoring) {
+    // This pass merges large allocas. StackSlotColoring is a different pass
+    // which merges spill slots.
+    addPass(&StackColoringLegacyID);
+  }
 
   // If the target requests it, assign local variables to stack slots relative
   // to one another and simplify frame index references where possible.
@@ -1496,6 +1501,12 @@ void TargetPassConfig::addOptimizedRegAlloc() {
     // Perform stack slot coloring and post-ra machine LICM.
     addPass(&StackSlotColoringID);
 
+    if (MergedStackColoring) {
+      // This pass merges large allocas. StackSlotColoring is a different pass
+      // which merges spill slots.
+      addPass(&StackColoringLegacyID);
+    }
+
     // Allow targets to expand pseudo instructions depending on the choice of
     // registers before MachineCopyPropagation.
     addPostRewrite();

>From fb19fd01466926f6976cb8a494c3eabb66cab2c1 Mon Sep 17 00:00:00 2001
From: tyker <tyker1 at outlook.com>
Date: Tue, 10 Jun 2025 22:44:32 +0200
Subject: [PATCH 12/19] Add new StackColoring algo

---
 llvm/include/llvm/CodeGen/MachineFrameInfo.h |  34 +-
 llvm/lib/CodeGen/MachineFrameInfo.cpp        |  22 +-
 llvm/lib/CodeGen/PrologEpilogInserter.cpp    |  29 +-
 llvm/lib/CodeGen/StackColoring.cpp           | 570 +++++++++++++++----
 4 files changed, 530 insertions(+), 125 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/MachineFrameInfo.h b/llvm/include/llvm/CodeGen/MachineFrameInfo.h
index 403e5eda949f1..fdb2fbd133397 100644
--- a/llvm/include/llvm/CodeGen/MachineFrameInfo.h
+++ b/llvm/include/llvm/CodeGen/MachineFrameInfo.h
@@ -120,12 +120,18 @@ class MachineFrameInfo {
                       ///< triggered protection.  3rd closest to the protector.
   };
 
+  static constexpr int NoUnderlyingSlot = std::numeric_limits<int>::min();
+  static constexpr int IsUnderlyingSlot = std::numeric_limits<int>::min() + 1;
+
 private:
   // Represent a single object allocated on the stack.
   struct StackObject {
     // The offset of this object from the stack pointer on entry to
     // the function.  This field has no meaning for a variable sized element.
-    int64_t SPOffset;
+    // After getting placed, this is relative to SP.
+    // If UnderlyingSlot is not NoUnderlyingSlot, this is relative to the start
+    // of the UnderlyingSlot.
+    int64_t Offset;
 
     // The size of this object on the stack. 0 means a variable sized object,
     // ~0ULL means a dead object.
@@ -134,6 +140,10 @@ class MachineFrameInfo {
     // The required alignment of this stack slot.
     Align Alignment;
 
+    // If not NoUnderlyingSlot, it indicates that this slot should be placed
+    // at Offset, into the slot UnderlyingSlot.
+    int UnderlyingSlot = NoUnderlyingSlot;
+
     // If true, the value of the stack object is set before
     // entering the function and is not modified inside the function. By
     // default, fixed objects are immutable unless marked otherwise.
@@ -183,10 +193,10 @@ class MachineFrameInfo {
 
     uint8_t SSPLayout = SSPLK_None;
 
-    StackObject(uint64_t Size, Align Alignment, int64_t SPOffset,
+    StackObject(uint64_t Size, Align Alignment, int64_t Offset,
                 bool IsImmutable, bool IsSpillSlot, const AllocaInst *Alloca,
                 bool IsAliased, uint8_t StackID = 0)
-        : SPOffset(SPOffset), Size(Size), Alignment(Alignment),
+        : Offset(Offset), Size(Size), Alignment(Alignment),
           isImmutable(IsImmutable), isSpillSlot(IsSpillSlot), StackID(StackID),
           Alloca(Alloca), isAliased(IsAliased) {}
   };
@@ -532,7 +542,7 @@ class MachineFrameInfo {
            "Invalid Object Idx!");
     assert(!isDeadObjectIndex(ObjectIdx) &&
            "Getting frame offset for a dead object?");
-    return Objects[ObjectIdx+NumFixedObjects].SPOffset;
+    return Objects[ObjectIdx + NumFixedObjects].Offset;
   }
 
   bool isObjectZExt(int ObjectIdx) const {
@@ -561,12 +571,12 @@ class MachineFrameInfo {
 
   /// Set the stack frame offset of the specified object. The
   /// offset is relative to the stack pointer on entry to the function.
-  void setObjectOffset(int ObjectIdx, int64_t SPOffset) {
+  void setObjectOffset(int ObjectIdx, int64_t Offset) {
     assert(unsigned(ObjectIdx+NumFixedObjects) < Objects.size() &&
            "Invalid Object Idx!");
     assert(!isDeadObjectIndex(ObjectIdx) &&
            "Setting frame offset for a dead object?");
-    Objects[ObjectIdx+NumFixedObjects].SPOffset = SPOffset;
+    Objects[ObjectIdx + NumFixedObjects].Offset = Offset;
   }
 
   SSPLayoutKind getObjectSSPLayout(int ObjectIdx) const {
@@ -762,6 +772,18 @@ class MachineFrameInfo {
     // If ID == 0, MaxAlignment will need to be updated separately.
   }
 
+  int getUnderlyingSlot(int ObjectIdx) {
+    assert(unsigned(ObjectIdx + NumFixedObjects) < Objects.size() &&
+           "Invalid Object Idx!");
+    return Objects[ObjectIdx + NumFixedObjects].UnderlyingSlot;
+  }
+
+  void setUnderlyingSlot(int ObjectIdx, int Underlying) {
+    assert(unsigned(ObjectIdx + NumFixedObjects) < Objects.size() &&
+           "Invalid Object Idx!");
+    Objects[ObjectIdx + NumFixedObjects].UnderlyingSlot = Underlying;
+  }
+
   /// Returns true if the specified index corresponds to a dead object.
   bool isDeadObjectIndex(int ObjectIdx) const {
     assert(unsigned(ObjectIdx+NumFixedObjects) < Objects.size() &&
diff --git a/llvm/lib/CodeGen/MachineFrameInfo.cpp b/llvm/lib/CodeGen/MachineFrameInfo.cpp
index 14dc871d89c13..e3d1761ef894a 100644
--- a/llvm/lib/CodeGen/MachineFrameInfo.cpp
+++ b/llvm/lib/CodeGen/MachineFrameInfo.cpp
@@ -81,7 +81,7 @@ int MachineFrameInfo::CreateVariableSizedObject(Align Alignment,
   return (int)Objects.size()-NumFixedObjects-1;
 }
 
-int MachineFrameInfo::CreateFixedObject(uint64_t Size, int64_t SPOffset,
+int MachineFrameInfo::CreateFixedObject(uint64_t Size, int64_t Offset,
                                         bool IsImmutable, bool IsAliased) {
   assert(Size != 0 && "Cannot allocate zero size fixed stack objects!");
   // The alignment of the frame index can be determined from its offset from
@@ -91,23 +91,22 @@ int MachineFrameInfo::CreateFixedObject(uint64_t Size, int64_t SPOffset,
   // stack needs realignment, we can't assume that the stack will in fact be
   // aligned.
   Align Alignment =
-      commonAlignment(ForcedRealign ? Align(1) : StackAlignment, SPOffset);
+      commonAlignment(ForcedRealign ? Align(1) : StackAlignment, Offset);
   Alignment = clampStackAlignment(!StackRealignable, Alignment, StackAlignment);
   Objects.insert(Objects.begin(),
-                 StackObject(Size, Alignment, SPOffset, IsImmutable,
+                 StackObject(Size, Alignment, Offset, IsImmutable,
                              /*IsSpillSlot=*/false, /*Alloca=*/nullptr,
                              IsAliased));
   return -++NumFixedObjects;
 }
 
-int MachineFrameInfo::CreateFixedSpillStackObject(uint64_t Size,
-                                                  int64_t SPOffset,
+int MachineFrameInfo::CreateFixedSpillStackObject(uint64_t Size, int64_t Offset,
                                                   bool IsImmutable) {
   Align Alignment =
-      commonAlignment(ForcedRealign ? Align(1) : StackAlignment, SPOffset);
+      commonAlignment(ForcedRealign ? Align(1) : StackAlignment, Offset);
   Alignment = clampStackAlignment(!StackRealignable, Alignment, StackAlignment);
   Objects.insert(Objects.begin(),
-                 StackObject(Size, Alignment, SPOffset, IsImmutable,
+                 StackObject(Size, Alignment, Offset, IsImmutable,
                              /*IsSpillSlot=*/true, /*Alloca=*/nullptr,
                              /*IsAliased=*/false));
   return -++NumFixedObjects;
@@ -240,8 +239,13 @@ void MachineFrameInfo::print(const MachineFunction &MF, raw_ostream &OS) const{
 
     if (i < NumFixedObjects)
       OS << ", fixed";
-    if (i < NumFixedObjects || SO.SPOffset != -1) {
-      int64_t Off = SO.SPOffset - ValOffset;
+    if (SO.UnderlyingSlot == MachineFrameInfo::IsUnderlyingSlot)
+      OS << ", underlying";
+    if (SO.UnderlyingSlot > MachineFrameInfo::IsUnderlyingSlot) {
+      OS << ", placed=" << "fi#" << (int)(SO.UnderlyingSlot - NumFixedObjects)
+         << "+" << SO.Offset;
+    } else if (i < NumFixedObjects || SO.Offset != -1) {
+      int64_t Off = SO.Offset - ValOffset;
       OS << ", at location [SP";
       if (Off > 0)
         OS << "+" << Off;
diff --git a/llvm/lib/CodeGen/PrologEpilogInserter.cpp b/llvm/lib/CodeGen/PrologEpilogInserter.cpp
index f66f54682c84c..7a44b3937a63b 100644
--- a/llvm/lib/CodeGen/PrologEpilogInserter.cpp
+++ b/llvm/lib/CodeGen/PrologEpilogInserter.cpp
@@ -694,6 +694,13 @@ void PEIImpl::spillCalleeSavedRegs(MachineFunction &MF) {
   }
 }
 
+static inline void UpdateOffset(MachineFrameInfo &MFI, int FrameIdx,
+                                int64_t Offset) {
+  LLVM_DEBUG(dbgs() << "alloc FI(" << FrameIdx << ") at SP[" << Offset
+                    << "]\n");
+  MFI.setObjectOffset(FrameIdx, Offset); // Set the computed offset
+}
+
 /// AdjustStackOffset - Helper function used to adjust the stack frame offset.
 static inline void AdjustStackOffset(MachineFrameInfo &MFI, int FrameIdx,
                                      bool StackGrowsDown, int64_t &Offset,
@@ -712,13 +719,9 @@ static inline void AdjustStackOffset(MachineFrameInfo &MFI, int FrameIdx,
   Offset = alignTo(Offset, Alignment);
 
   if (StackGrowsDown) {
-    LLVM_DEBUG(dbgs() << "alloc FI(" << FrameIdx << ") at SP[" << -Offset
-                      << "]\n");
-    MFI.setObjectOffset(FrameIdx, -Offset); // Set the computed offset
+    UpdateOffset(MFI, FrameIdx, -Offset);
   } else {
-    LLVM_DEBUG(dbgs() << "alloc FI(" << FrameIdx << ") at SP[" << Offset
-                      << "]\n");
-    MFI.setObjectOffset(FrameIdx, Offset);
+    UpdateOffset(MFI, FrameIdx, Offset);
     Offset += MFI.getObjectSize(FrameIdx);
   }
 }
@@ -1044,6 +1047,7 @@ void PEIImpl::calculateFrameObjectOffsets(MachineFunction &MF) {
   }
 
   SmallVector<int, 8> ObjectsToAllocate;
+  SmallVector<int, 8> UpdateOffsetAfterAllocate;
 
   // Then prepare to assign frame offsets to stack objects that are not used to
   // spill callee saved registers.
@@ -1064,6 +1068,11 @@ void PEIImpl::calculateFrameObjectOffsets(MachineFunction &MF) {
     if (MFI.getStackID(i) != TargetStackID::Default)
       continue;
 
+    if (MFI.getUnderlyingSlot(i) > MachineFrameInfo::IsUnderlyingSlot) {
+      UpdateOffsetAfterAllocate.push_back(i);
+      continue;
+    }
+
     // Add the objects that we need to allocate to our working set.
     ObjectsToAllocate.push_back(i);
   }
@@ -1104,6 +1113,14 @@ void PEIImpl::calculateFrameObjectOffsets(MachineFunction &MF) {
       AdjustStackOffset(MFI, SFI, StackGrowsDown, Offset, MaxAlign);
   }
 
+  for (int FrameIdx : UpdateOffsetAfterAllocate) {
+    int UnderlyingSlot = MFI.getUnderlyingSlot(FrameIdx);
+    int64_t ObjOffset =
+        MFI.getObjectOffset(UnderlyingSlot) + MFI.getObjectOffset(FrameIdx);
+    UpdateOffset(MFI, FrameIdx, ObjOffset);
+    MFI.setUnderlyingSlot(FrameIdx, MachineFrameInfo::NoUnderlyingSlot);
+  }
+
   if (!TFI.targetHandlesStackFrameRounding()) {
     // If we have reserved argument space for call sites in the function
     // immediately on entry to the current function, count it as part of the
diff --git a/llvm/lib/CodeGen/StackColoring.cpp b/llvm/lib/CodeGen/StackColoring.cpp
index c5fd00b558d9f..4cfdc678643f8 100644
--- a/llvm/lib/CodeGen/StackColoring.cpp
+++ b/llvm/lib/CodeGen/StackColoring.cpp
@@ -25,6 +25,7 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/DepthFirstIterator.h"
 #include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/ValueTracking.h"
@@ -55,6 +56,7 @@
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/DebugCounter.h"
 #include <algorithm>
 #include <cassert>
 #include <limits>
@@ -65,6 +67,9 @@ using namespace llvm;
 
 #define DEBUG_TYPE "stack-coloring"
 
+DEBUG_COUNTER(ProcessSlot, DEBUG_TYPE "-slot",
+              "Controls which slot get processed");
+
 static cl::opt<bool>
 DisableColoring("no-stack-coloring",
         cl::init(false), cl::Hidden,
@@ -90,8 +95,19 @@ LifetimeStartOnFirstUse("stackcoloring-lifetime-start-on-first-use",
         cl::init(true), cl::Hidden,
         cl::desc("Treat stack lifetimes as starting on first use, not on START marker."));
 
+static cl::opt<bool> UseNewStackColoring(
+    "new-stack-coloring", cl::init(false), cl::Hidden,
+    cl::desc("Use a better logic to try to reduce stack usage"));
+
+static constexpr unsigned MaxCandidatesToConsiderDefault = 5;
+static cl::opt<unsigned> MaxCandidatesToConsider(
+    "stackcoloring-max-candidates", cl::init(MaxCandidatesToConsiderDefault),
+    cl::Hidden,
+    cl::desc(
+        "Max number of candidates that will be evaluated, 0 means no limit"));
 
 STATISTIC(NumMarkerSeen,  "Number of lifetime markers found.");
+STATISTIC(GeneratedWorse, "Number of times worse layout were generated");
 STATISTIC(StackSpaceSaved, "Number of bytes saved due to merging slots.");
 STATISTIC(StackSlotMerged, "Number of stack slot merged.");
 STATISTIC(EscapedAllocas, "Number of allocas that escaped the lifetime region");
@@ -375,12 +391,43 @@ STATISTIC(EscapedAllocas, "Number of allocas that escaped the lifetime region");
 
 namespace {
 
+constexpr unsigned InvalidIdx = -1;
+
 /// StackColoring - A machine pass for merging disjoint stack allocations,
 /// marked by the LIFETIME_START and LIFETIME_END pseudo instructions.
 class StackColoring {
   MachineFrameInfo *MFI = nullptr;
   MachineFunction *MF = nullptr;
 
+  struct SlotInfo {
+    // All places in the current function where this Slot is live
+    BitVector Liveness;
+
+    // Used to make overlap queries faster
+    SmallVector<unsigned, 4> StartLiveness;
+
+    uint64_t SlotPriority = 0;
+
+    unsigned Offset = InvalidIdx;
+
+    unsigned Size = 0;
+
+    Align Align;
+
+    bool hasOverlap(SlotInfo &Other) {
+      // NOTE: This is not just a faster way to say
+      // return Liveness.anyCommon(Other.Liveness);
+      // This also allows merging slots that have overlapping lifetimes but
+      // cannot be live simultaneously
+      return any_of(StartLiveness,
+                    [&](int Idx) { return Other.Liveness[Idx]; }) ||
+             any_of(Other.StartLiveness,
+                    [&](int Idx) { return Liveness[Idx]; });
+    }
+
+    LLVM_DUMP_METHOD void dump(const StackColoring* State = nullptr) const;
+  };
+
   /// A class representing liveness information for a single basic block.
   /// Each bit in the BitVector represents the liveness property
   /// for a different stack slot.
@@ -405,6 +452,9 @@ class StackColoring {
   /// Maps basic blocks to a serial number.
   SmallVector<const MachineBasicBlock *, 8> BasicBlockNumbering;
 
+  unsigned LivenessSize;
+  SmallVector<SlotInfo, 0> Slot2Info;
+
   /// Maps slots to their use interval. Outside of this interval, slots
   /// values are either dead or `undef` and they will not be written to.
   SmallVector<std::unique_ptr<LiveRange>, 16> Intervals;
@@ -458,6 +508,8 @@ class StackColoring {
   /// in and out blocks.
   void calculateLocalLiveness();
 
+  unsigned doMerging(unsigned NumSlots);
+
   /// Returns TRUE if we're using the first-use-begins-lifetime method for
   /// this slot (if FALSE, then the start marker is treated as start of lifetime).
   bool applyFirstUse(int Slot) {
@@ -482,7 +534,7 @@ class StackColoring {
 
   /// Go over the machine function and change instructions which use stack
   /// slots to use the joint slots.
-  void remapInstructions(DenseMap<int, int> &SlotRemap);
+  void remapInstructions(DenseMap<int, int> &SlotRemap, int MergedSlot);
 
   /// The input program may contain instructions which are not inside lifetime
   /// markers. This can happen due to a bug in the compiler or due to a bug in
@@ -527,6 +579,10 @@ void StackColoringLegacy::getAnalysisUsage(AnalysisUsage &AU) const {
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 
 LLVM_DUMP_METHOD void dumpBV(StringRef tag, const BitVector &BV) {
+  if (BV.size() == 0) {
+    dbgs() << tag << " : EMPTY\n";
+    return;
+  }
   constexpr unsigned ColumnWidth = 150;
   unsigned LineStartOffset = tag.size() + /*" : "*/ 3;
   unsigned WidthAfterTag = ColumnWidth - LineStartOffset;
@@ -588,7 +644,38 @@ LLVM_DUMP_METHOD void StackColoring::dumpIntervals() const {
       dbgs() << ' ' << SIdx;
     dbgs() << '\n';
   }
+  for (unsigned Slot = 0; Slot < Slot2Info.size(); Slot++) {
+    Slot2Info[Slot].dump(this);
+  }
 }
+
+LLVM_DUMP_METHOD void StackColoring::SlotInfo::dump(const StackColoring* State) const {
+  unsigned Slot = InvalidIdx;
+  if (State)
+    Slot = this - State->Slot2Info.data();
+  dbgs() << "SlotInfo"; 
+  if (State)
+    dbgs() << "(" << Slot << ")";
+  dbgs()<< ": ";
+  dbgs() << '\n';
+  if (State)
+    if (State->MFI->getObjectAllocation(Slot)) {
+      State->MFI->getObjectAllocation(Slot)->print(dbgs());
+      dbgs() << '\n';
+    }
+  dbgs() << "Size=" << Size << " Align=" << Align.value() << '\n';
+  dumpBV("LIVENESS   ", Liveness);
+  BitVector Start;
+  Start.resize(Liveness.size());
+  for (unsigned idx : StartLiveness) {
+    if (idx >= Start.size())
+      Start.resize(idx + 1);
+    Start[idx] = true;
+  }
+  dumpBV("LIVE START ", Start);
+  dbgs() << "\n";
+}
+
 #endif
 
 static inline int getStartOrEndSlot(const MachineInstr &MI)
@@ -862,23 +949,39 @@ void StackColoring::calculateLocalLiveness() {
 
 void StackColoring::calculateLiveIntervals(unsigned NumSlots) {
   SmallVector<SlotIndex, 16> Starts;
-  SmallVector<bool, 16> DefinitelyInUse;
+  BitVector DefinitelyInUse;
+  SmallVector<int, 16> StartIdx;
+
+  int CurrIdx = 0;
+
+  DefinitelyInUse.resize(NumSlots);
 
   // For each block, find which slots are active within this block
   // and update the live intervals.
   for (const MachineBasicBlock &MBB : *MF) {
-    Starts.clear();
-    Starts.resize(NumSlots);
-    DefinitelyInUse.clear();
-    DefinitelyInUse.resize(NumSlots);
+    Starts.assign(NumSlots, SlotIndex());
+    StartIdx.assign(NumSlots, -1);
+    DefinitelyInUse.reset();
 
     // Start the interval of the slots that we previously found to be 'in-use'.
     BlockLifetimeInfo &MBBLiveness = BlockLiveness[MBB.getNumber()];
-    for (int pos = MBBLiveness.LiveIn.find_first(); pos != -1;
-         pos = MBBLiveness.LiveIn.find_next(pos)) {
+    for (int pos : MBBLiveness.LiveIn.set_bits()) {
       Starts[pos] = Indexes->getMBBStartIdx(&MBB);
+      StartIdx[pos] = CurrIdx;
     }
 
+    bool StartedSinceInc = false;
+    auto EndRangeFor = [&](int Slot) {
+      if (StartIdx[Slot] == CurrIdx || StartedSinceInc) {
+        CurrIdx++;
+        StartedSinceInc = false;
+      }
+      Slot2Info[Slot].Liveness.resize(CurrIdx + 1);
+      Slot2Info[Slot].Liveness.set(StartIdx[Slot], CurrIdx);
+      StartIdx[Slot] = -1;
+      DefinitelyInUse[Slot] = false;
+    };
+
     // Create the interval for the basic blocks containing lifetime begin/end.
     for (const MachineInstr &MI : MBB) {
       SmallVector<int, 4> slots;
@@ -888,16 +991,21 @@ void StackColoring::calculateLiveIntervals(unsigned NumSlots) {
       SlotIndex ThisIndex = Indexes->getInstructionIndex(MI);
       for (auto Slot : slots) {
         if (IsStart) {
+          StartedSinceInc = true;
           // If a slot is already definitely in use, we don't have to emit
           // a new start marker because there is already a pre-existing
           // one.
           if (!DefinitelyInUse[Slot]) {
             LiveStarts[Slot].push_back(ThisIndex);
+            Slot2Info[Slot].StartLiveness.push_back(CurrIdx);
             DefinitelyInUse[Slot] = true;
           }
           if (!Starts[Slot].isValid())
             Starts[Slot] = ThisIndex;
+          if (StartIdx[Slot] == -1)
+            StartIdx[Slot] = CurrIdx;
         } else {
+          assert(Starts[Slot].isValid() == (StartIdx[Slot] != -1));
           if (Starts[Slot].isValid()) {
             VNInfo *VNI = Intervals[Slot]->getValNumInfo(0);
             Intervals[Slot]->addSegment(
@@ -905,10 +1013,18 @@ void StackColoring::calculateLiveIntervals(unsigned NumSlots) {
             Starts[Slot] = SlotIndex(); // Invalidate the start index
             DefinitelyInUse[Slot] = false;
           }
+          if (StartIdx[Slot] != -1)
+            EndRangeFor(Slot);
         }
       }
     }
 
+    for (unsigned i = 0; i < NumSlots; ++i) {
+      if (StartIdx[i] == -1)
+        continue;
+      EndRangeFor(i);
+    }
+
     // Finish up started segments
     for (unsigned i = 0; i < NumSlots; ++i) {
       if (!Starts[i].isValid())
@@ -919,6 +1035,18 @@ void StackColoring::calculateLiveIntervals(unsigned NumSlots) {
       Intervals[i]->addSegment(LiveInterval::Segment(Starts[i], EndIdx, VNI));
     }
   }
+  LivenessSize = CurrIdx;
+  for (SlotInfo &Info : Slot2Info) {
+    Info.Liveness.resize(CurrIdx);
+
+    // This is only to make us index into Liveness in order when doing a
+    // SlotInfo::hasOverlap, which should have better cache locality
+    std::sort(Info.StartLiveness.begin(), Info.StartLiveness.end());
+#ifndef NDEBUG
+    for (int Start : Info.StartLiveness)
+      assert(Info.Liveness[Start]);
+#endif
+  }
 }
 
 bool StackColoring::removeAllMarkers() {
@@ -944,7 +1072,7 @@ bool StackColoring::removeAllMarkers() {
   return Count;
 }
 
-void StackColoring::remapInstructions(DenseMap<int, int> &SlotRemap) {
+void StackColoring::remapInstructions(DenseMap<int, int>& SlotRemap, int MergedSlot) {
   unsigned FixedInstr = 0;
   unsigned FixedMemOp = 0;
   unsigned FixedDbg = 0;
@@ -954,6 +1082,10 @@ void StackColoring::remapInstructions(DenseMap<int, int> &SlotRemap) {
     if (!VI.Var || !VI.inStackSlot())
       continue;
     int Slot = VI.getStackSlot();
+    if (Slot >= 0 && Slot2Info[Slot].Offset != InvalidIdx) {
+      // FIXME: properly update the debug info with the offset into MergedSlot
+      VI.updateStackSlot(MergedSlot);
+    }
     if (auto It = SlotRemap.find(Slot); It != SlotRemap.end()) {
       LLVM_DEBUG(dbgs() << "Remapping debug info for ["
                         << cast<DILocalVariable>(VI.Var)->getName() << "].\n");
@@ -1062,6 +1194,12 @@ void StackColoring::remapInstructions(DenseMap<int, int> &SlotRemap) {
         if (FromSlot<0)
           continue;
 
+        if (FromSlot >= 0 && Slot2Info[FromSlot].Offset != InvalidIdx) {
+          MO.setIndex(MergedSlot);
+          MO.setOffset(MO.getOffset() + Slot2Info[FromSlot].Offset);
+          continue;
+        }
+
         // Only look at mapped slots.
         if (!SlotRemap.count(FromSlot))
           continue;
@@ -1103,6 +1241,8 @@ void StackColoring::remapInstructions(DenseMap<int, int> &SlotRemap) {
           auto To = SlotRemap.find(FI);
           if (To != SlotRemap.end())
             SSRefs[FI].push_back(MMO);
+          if (FI >= 0 && Slot2Info[FI].Offset != InvalidIdx)
+            SSRefs[FI].push_back(MMO);
         }
 
         // If this memory location can be a slot remapped here,
@@ -1121,7 +1261,7 @@ void StackColoring::remapInstructions(DenseMap<int, int> &SlotRemap) {
                 // that is not remapped, we continue checking.
                 // Otherwise, we need to invalidate AA infomation.
                 const AllocaInst *AI = dyn_cast_or_null<AllocaInst>(V);
-                if (AI && MergedAllocas.count(AI)) {
+                if ((AI && MergedAllocas.count(AI)) || UseNewStackColoring) {
                   MayHaveConflictingAAMD = true;
                   break;
                 }
@@ -1145,10 +1285,17 @@ void StackColoring::remapInstructions(DenseMap<int, int> &SlotRemap) {
   // Rewrite MachineMemOperands that reference old frame indices.
   for (auto E : enumerate(SSRefs))
     if (!E.value().empty()) {
-      const PseudoSourceValue *NewSV =
-          MF->getPSVManager().getFixedStack(SlotRemap.find(E.index())->second);
-      for (MachineMemOperand *Ref : E.value())
-        Ref->setValue(NewSV);
+      if (UseNewStackColoring) {
+        const PseudoSourceValue *NewSV =
+            MF->getPSVManager().getFixedStack(MergedSlot);
+        for (MachineMemOperand *Ref : E.value())
+          Ref->setValue(NewSV);
+      } else {
+        const PseudoSourceValue *NewSV = MF->getPSVManager().getFixedStack(
+            SlotRemap.find(E.index())->second);
+        for (MachineMemOperand *Ref : E.value())
+          Ref->setValue(NewSV);
+      }
     }
 
   // Update the location of C++ catch objects for the MSVC personality routine.
@@ -1245,6 +1392,195 @@ PreservedAnalyses StackColoringPass::run(MachineFunction &MF,
   return PreservedAnalyses::all();
 }
 
+unsigned StackColoring::doMerging(unsigned NumSlots) {
+  SmallVector<unsigned> SlotStack;
+  Align FinalAlign;
+
+  int64_t OrigOptSize = 0;
+  int64_t OrigPesSize = 0;
+  for (unsigned Slot = 0; Slot < NumSlots; Slot++) {
+    SlotInfo& Info = Slot2Info[Slot];
+    if (!Info.StartLiveness.empty() &&
+        DebugCounter::shouldExecute(ProcessSlot)) {
+      FinalAlign = std::max(FinalAlign, Info.Align);
+
+      // Note: This is maybe not a completely fair comparison with the
+      // previous algo, as PEI should be smarter than that about alignment. But
+      // a fair comparison is hard since the old algo doesn't deal in alignment
+      // at all.
+      OrigPesSize = alignTo(OrigPesSize, Info.Align);
+      OrigPesSize += Info.Size;
+      OrigOptSize += Info.Size;
+      SlotStack.push_back(Slot);
+    }
+  }
+
+  if (SlotStack.size() <= 1)
+    return InvalidIdx;
+
+  // This whole block is only used to try and order the stack, such that the
+  // slots are processed in an order that helps get a good packing.
+  {
+    // Find how much usage of every livepoint there is.
+    SmallVector<unsigned> CumulatedUsage;
+    CumulatedUsage.resize(LivenessSize, 0);
+
+    for (unsigned Idx = 0; Idx < SlotStack.size(); Idx++) {
+      SlotInfo &Info = Slot2Info[SlotStack[Idx]];
+      for (unsigned Pt : Info.Liveness.set_bits()) {
+        CumulatedUsage[Pt] += Info.Size;
+      }
+    }
+
+    for (unsigned Idx = 0; Idx < SlotStack.size(); Idx++) {
+      SlotInfo &Info = Slot2Info[SlotStack[Idx]];
+      for (unsigned Pt : Info.Liveness.set_bits()) {
+        // Since the goal is to minimize the max usage, blocks that are in high
+        // contention areas are given more priority
+        Info.SlotPriority +=
+            (uint64_t)CumulatedUsage[Pt] * (uint64_t)CumulatedUsage[Pt] +
+            (uint64_t)Info.Size * (uint64_t)Info.Align.value();
+      }
+    }
+    std::stable_sort(
+        SlotStack.begin(), SlotStack.end(), [&](unsigned Lhs, unsigned Rhs) {
+          if (Lhs == InvalidIdx)
+            return false;
+          if (Rhs == InvalidIdx)
+            return true;
+          return Slot2Info[Lhs].SlotPriority < Slot2Info[Rhs].SlotPriority;
+        });
+  }
+
+  SlotInfo* LastQueryLhs = nullptr;
+  SlotInfo* LastQueryRhs = nullptr;
+  bool LastQueryRes = false;
+  auto HasOverlapCached = [&](SlotInfo &Lhs, SlotInfo &Rhs) {
+    if (&Lhs == LastQueryLhs && LastQueryRhs == &Rhs)
+      return LastQueryRes;
+    LastQueryLhs = &Lhs;
+    LastQueryRhs = &Rhs;
+    LastQueryRes = Lhs.hasOverlap(Rhs);
+    return LastQueryRes;
+  };
+
+  struct Status {
+    unsigned Offset = 0;
+    unsigned Slot = InvalidIdx;
+    unsigned Prev = InvalidIdx;
+  };
+
+  SmallVector<Status> LatestStatus;
+  LatestStatus.resize(LivenessSize, Status{});
+  SmallVector<Status> OlderStatus;
+
+  auto FindOffset = [&](SlotInfo &Info, unsigned Pt) {
+    Status *Last = &LatestStatus[Pt];
+
+    // This is only called on Slot that have overlapping lifetimes
+    // So the no overlap only happens when there lifetime overlap but only one
+    // can be live because where they start in the CFG is mutually exclusive
+    // See the comment about implementation for an example
+    while (LLVM_UNLIKELY(Last->Slot != InvalidIdx &&
+                         !HasOverlapCached(Info, Slot2Info[Last->Slot])))
+      Last = &OlderStatus[Last->Prev];
+    return Last->Offset;
+  };
+  auto UpdateOffset = [&](SlotInfo &Info, unsigned Pt, unsigned Offset) {
+    Status& Last = LatestStatus[Pt];
+    unsigned Idx = OlderStatus.size();
+    OlderStatus.push_back(Last);
+    Last.Prev = Idx;
+    Last.Offset = Offset;
+    Last.Slot = &Info - Slot2Info.data();
+  };
+
+  SmallVector<unsigned, MaxCandidatesToConsiderDefault> Candidates;
+  unsigned MaxCandidates =
+      MaxCandidatesToConsider == 0 ? ~0u : MaxCandidatesToConsider;
+  for (unsigned I = 0; I < MaxCandidates; I++) {
+    if (SlotStack.empty())
+      break;
+    Candidates.push_back(SlotStack.pop_back_val());
+  }
+
+  while (!Candidates.empty()) {
+    int64_t BestScore = std::numeric_limits<int64_t>::max();
+    unsigned BestIdx = InvalidIdx;
+    unsigned BestOffset = InvalidIdx;
+
+    for (unsigned K = 0; K < Candidates.size(); K++) {
+      SlotInfo &Info = Slot2Info[Candidates[K]];
+      unsigned Offset = 0;
+      for (unsigned Pt : Info.Liveness.set_bits())
+        Offset = std::max(Offset, FindOffset(Info, Pt));
+
+      Offset = alignTo(Offset, Info.Align);
+
+      int64_t Score = (int64_t)Offset - (int64_t)Log2(Info.Align);
+      LLVM_DEBUG(dbgs() << "SlotInfo(" << Candidates[K] << ") Score=" << Score << "\n");
+      bool IsBetter = [&] {
+        if (BestScore != Score)
+          return BestScore > Score;
+        SlotInfo &Other = Slot2Info[Candidates[K]];
+        if (Other.Size != Info.Size)
+          return Other.Size < Info.Size;
+        if (Other.SlotPriority != Info.SlotPriority)
+          return Other.SlotPriority < Info.SlotPriority;
+
+        // Both are always stored in Slot2Info, so this is deterministic
+        return &Other < &Info;
+      }();
+
+      if (IsBetter) {
+        BestScore = Score;
+        BestIdx = K;
+        BestOffset = Offset;
+      }
+    }
+    SlotInfo &Info = Slot2Info[Candidates[BestIdx]];
+
+    LLVM_DEBUG(Info.dump(this));
+    LLVM_DEBUG(dbgs() << "Placing SlotInfo(" << Candidates[BestIdx] << ") at "
+                      << BestOffset << " Score=" << BestScore << "\n");
+
+    Info.Offset = BestOffset;
+    for (unsigned Pt : Info.Liveness.set_bits())
+      UpdateOffset(Info, Pt, BestOffset + Info.Size);
+
+    std::swap(Candidates[BestIdx], Candidates.back());
+    Candidates.pop_back();
+    if (!SlotStack.empty())
+      Candidates.push_back(SlotStack.pop_back_val());
+  }
+
+  unsigned FinalSize = 0;
+  for (Status& U : LatestStatus)
+    FinalSize = std::max(FinalSize, U.Offset);
+  LLVM_DEBUG(dbgs() << "MergedSize=" << FinalSize << " OrigPesSize="
+                    << OrigPesSize << " OrigOptSize" << OrigOptSize << "\n");
+  if (FinalSize >= OrigPesSize) {
+    GeneratedWorse++;
+    return InvalidIdx;
+  }
+
+  int MergedSlot =
+      MFI->CreateStackObject(FinalSize, FinalAlign, /*isSpillSlot=*/false);
+  MFI->setUnderlyingSlot(MergedSlot, MachineFrameInfo::IsUnderlyingSlot);
+
+  for (unsigned Slot = 0; Slot < NumSlots; Slot++)
+    if (Slot2Info[Slot].Offset != InvalidIdx) {
+      MFI->setUnderlyingSlot(Slot, MergedSlot);
+      MFI->setObjectOffset(Slot, Slot2Info[Slot].Offset);
+    }
+
+  // Note: this counts differently from the previous algo because this logic
+  // cares about alignment, while the older algo doesn't.
+  StackSpaceSaved += OrigPesSize - FinalSize;
+
+  return MergedSlot;
+}
+
 bool StackColoring::run(MachineFunction &Func) {
   LLVM_DEBUG(dbgs() << "********** Stack Coloring **********\n"
                     << "********** Function: " << Func.getName() << '\n');
@@ -1256,11 +1592,12 @@ bool StackColoring::run(MachineFunction &Func) {
   Intervals.clear();
   LiveStarts.clear();
   VNInfoAllocator.Reset();
+  Slot2Info.clear();
 
   unsigned NumSlots = MFI->getObjectIndexEnd();
 
   // If there are no stack slots then there are no markers to remove.
-  if (!NumSlots || DisableColoring)
+  if (NumSlots < 2 || DisableColoring)
     return removeAllMarkers();
 
   SmallVector<int, 8> SortedSlots;
@@ -1290,11 +1627,16 @@ bool StackColoring::run(MachineFunction &Func) {
     return removeAllMarkers();
   }
 
+  Slot2Info.resize(NumSlots);
   for (unsigned i=0; i < NumSlots; ++i) {
     std::unique_ptr<LiveRange> LI(new LiveRange());
     LI->getNextValue(Indexes->getZeroIndex(), VNInfoAllocator);
     Intervals.push_back(std::move(LI));
     SortedSlots.push_back(i);
+
+    Slot2Info[i].Align = MFI->getObjectAlign(i);
+    Slot2Info[i].Size = MFI->getObjectSize(i);
+    Slot2Info[i].Offset = InvalidIdx;
   }
 
   // Calculate the liveness of each block.
@@ -1311,105 +1653,125 @@ bool StackColoring::run(MachineFunction &Func) {
   if (ProtectFromEscapedAllocas)
     removeInvalidSlotRanges();
 
-  // Maps old slots to new slots.
-  DenseMap<int, int> SlotRemap;
-  unsigned RemovedSlots = 0;
-  unsigned ReducedSize = 0;
+  if (!UseNewStackColoring) {
+    // Maps old slots to new slots.
+    DenseMap<int, int> SlotRemap;
+    unsigned RemovedSlots = 0;
+    unsigned ReducedSize = 0;
 
-  // Do not bother looking at empty intervals.
-  for (unsigned I = 0; I < NumSlots; ++I) {
-    if (Intervals[SortedSlots[I]]->empty())
-      SortedSlots[I] = -1;
-  }
-
-  // This is a simple greedy algorithm for merging allocas. First, sort the
-  // slots, placing the largest slots first. Next, perform an n^2 scan and look
-  // for disjoint slots. When you find disjoint slots, merge the smaller one
-  // into the bigger one and update the live interval. Remove the small alloca
-  // and continue.
-
-  // Sort the slots according to their size. Place unused slots at the end.
-  // Use stable sort to guarantee deterministic code generation.
-  llvm::stable_sort(SortedSlots, [this](int LHS, int RHS) {
-    // We use -1 to denote a uninteresting slot. Place these slots at the end.
-    if (LHS == -1)
-      return false;
-    if (RHS == -1)
-      return true;
-    // Sort according to size.
-    return MFI->getObjectSize(LHS) > MFI->getObjectSize(RHS);
-  });
-
-  for (auto &s : LiveStarts)
-    llvm::sort(s);
-
-  bool Changed = true;
-  while (Changed) {
-    Changed = false;
+    // Do not bother looking at empty intervals.
     for (unsigned I = 0; I < NumSlots; ++I) {
-      if (SortedSlots[I] == -1)
-        continue;
+      if (Intervals[SortedSlots[I]]->empty())
+        SortedSlots[I] = -1;
+    }
 
-      for (unsigned J=I+1; J < NumSlots; ++J) {
-        if (SortedSlots[J] == -1)
+    // This is a simple greedy algorithm for merging allocas. First, sort the
+    // slots, placing the largest slots first. Next, perform an n^2 scan and
+    // look for disjoint slots. When you find disjoint slots, merge the smaller
+    // one into the bigger one and update the live interval. Remove the small
+    // alloca and continue.
+
+    // Sort the slots according to their size. Place unused slots at the end.
+    // Use stable sort to guarantee deterministic code generation.
+    llvm::stable_sort(SortedSlots, [this](int LHS, int RHS) {
+      // We use -1 to denote a uninteresting slot. Place these slots at the end.
+      if (LHS == -1)
+        return false;
+      if (RHS == -1)
+        return true;
+      // Sort according to size.
+      return MFI->getObjectSize(LHS) > MFI->getObjectSize(RHS);
+    });
+
+    for (auto &s : LiveStarts)
+      llvm::sort(s);
+
+    bool Changed = true;
+    while (Changed) {
+      Changed = false;
+      for (unsigned I = 0; I < NumSlots; ++I) {
+        if (SortedSlots[I] == -1)
           continue;
 
-        int FirstSlot = SortedSlots[I];
-        int SecondSlot = SortedSlots[J];
+        for (unsigned J = I + 1; J < NumSlots; ++J) {
+          if (SortedSlots[J] == -1)
+            continue;
 
-        // Objects with different stack IDs cannot be merged.
-        if (MFI->getStackID(FirstSlot) != MFI->getStackID(SecondSlot))
-          continue;
+          int FirstSlot = SortedSlots[I];
+          int SecondSlot = SortedSlots[J];
 
-        LiveRange *First = &*Intervals[FirstSlot];
-        LiveRange *Second = &*Intervals[SecondSlot];
-        auto &FirstS = LiveStarts[FirstSlot];
-        auto &SecondS = LiveStarts[SecondSlot];
-        assert(!First->empty() && !Second->empty() && "Found an empty range");
-
-        // Merge disjoint slots. This is a little bit tricky - see the
-        // Implementation Notes section for an explanation.
-        if (!First->isLiveAtIndexes(SecondS) &&
-            !Second->isLiveAtIndexes(FirstS)) {
-          Changed = true;
-          First->MergeSegmentsInAsValue(*Second, First->getValNumInfo(0));
-
-          int OldSize = FirstS.size();
-          FirstS.append(SecondS.begin(), SecondS.end());
-          auto Mid = FirstS.begin() + OldSize;
-          std::inplace_merge(FirstS.begin(), Mid, FirstS.end());
-
-          SlotRemap[SecondSlot] = FirstSlot;
-          SortedSlots[J] = -1;
-          LLVM_DEBUG(dbgs() << "Merging #" << FirstSlot << " and slots #"
-                            << SecondSlot << " together.\n");
-          Align MaxAlignment = std::max(MFI->getObjectAlign(FirstSlot),
-                                        MFI->getObjectAlign(SecondSlot));
-
-          assert(MFI->getObjectSize(FirstSlot) >=
-                 MFI->getObjectSize(SecondSlot) &&
-                 "Merging a small object into a larger one");
-
-          RemovedSlots+=1;
-          ReducedSize += MFI->getObjectSize(SecondSlot);
-          MFI->setObjectAlignment(FirstSlot, MaxAlignment);
-          MFI->RemoveStackObject(SecondSlot);
+          // Objects with different stack IDs cannot be merged.
+          if (MFI->getStackID(FirstSlot) != MFI->getStackID(SecondSlot))
+            continue;
+
+          LiveRange *First = &*Intervals[FirstSlot];
+          LiveRange *Second = &*Intervals[SecondSlot];
+          auto &FirstS = LiveStarts[FirstSlot];
+          auto &SecondS = LiveStarts[SecondSlot];
+          assert(!First->empty() && !Second->empty() && "Found an empty range");
+
+          bool OldNoOverlap = !First->isLiveAtIndexes(SecondS) &&
+                              !Second->isLiveAtIndexes(FirstS);
+
+          SlotInfo &FSlot = Slot2Info[FirstSlot];
+          SlotInfo &SSlot = Slot2Info[SecondSlot];
+          bool NewNoOverlap = !FSlot.hasOverlap(SSlot);
+
+          // if (NewNoOverlap != OldNoOverlap) {
+          //   LLVM_DEBUG(dbgs() << "OldNoOverlap=" << OldNoOverlap
+          //                     << " NewNoOverlap=" << NewNoOverlap << "\n");
+          // }
+          // assert(OldNoOverlap == NewNoOverlap);
+
+          // Merge disjoint slots. This is a little bit tricky - see the
+          // Implementation Notes section for an explanation.
+          if (OldNoOverlap) {
+            Changed = true;
+            First->MergeSegmentsInAsValue(*Second, First->getValNumInfo(0));
+
+            int OldSize = FirstS.size();
+            FirstS.append(SecondS.begin(), SecondS.end());
+            auto Mid = FirstS.begin() + OldSize;
+            std::inplace_merge(FirstS.begin(), Mid, FirstS.end());
+
+            // FSlot.Liveness |= SSlot.Liveness;
+
+            SlotRemap[SecondSlot] = FirstSlot;
+            SortedSlots[J] = -1;
+            LLVM_DEBUG(dbgs() << "Merging #" << FirstSlot << " and slots #"
+                              << SecondSlot << " together.\n");
+            Align Alignment = std::max(MFI->getObjectAlign(FirstSlot),
+                                       MFI->getObjectAlign(SecondSlot));
+
+            assert(MFI->getObjectSize(FirstSlot) >=
+                       MFI->getObjectSize(SecondSlot) &&
+                   "Merging a small object into a larger one");
+
+            RemovedSlots += 1;
+            ReducedSize += MFI->getObjectSize(SecondSlot);
+            MFI->setObjectAlignment(FirstSlot, Alignment);
+            MFI->RemoveStackObject(SecondSlot);
+          }
         }
       }
+    } // While changed.
+
+    // Record statistics.
+    StackSpaceSaved += ReducedSize;
+    StackSlotMerged += RemovedSlots;
+    LLVM_DEBUG(dbgs() << "Merge " << RemovedSlots << " slots. Saved "
+                      << ReducedSize << " bytes\n");
+
+    // Scan the entire function and update all machine operands that use frame
+    // indices to use the remapped frame index.
+    if (!SlotRemap.empty()) {
+      expungeSlotMap(SlotRemap, NumSlots);
+      remapInstructions(SlotRemap, InvalidIdx);
     }
-  }// While changed.
-
-  // Record statistics.
-  StackSpaceSaved += ReducedSize;
-  StackSlotMerged += RemovedSlots;
-  LLVM_DEBUG(dbgs() << "Merge " << RemovedSlots << " slots. Saved "
-                    << ReducedSize << " bytes\n");
-
-  // Scan the entire function and update all machine operands that use frame
-  // indices to use the remapped frame index.
-  if (!SlotRemap.empty()) {
-    expungeSlotMap(SlotRemap, NumSlots);
-    remapInstructions(SlotRemap);
+  } else {
+    // Maybe this entire logic should be moved to a generic StackLayouter that
+    // is used for PrologEpilogInserter and LocalStackSlotAllocation.
+    doMerging(NumSlots);
   }
 
   return removeAllMarkers();

>From 09de6f3c4cb8828cffd607d8b2fa75decdc3e779 Mon Sep 17 00:00:00 2001
From: tyker <tyker1 at outlook.com>
Date: Mon, 9 Jun 2025 19:23:03 +0200
Subject: [PATCH 13/19] Start rebuild lifetimes for spill slots

---
 llvm/lib/CodeGen/StackColoring.cpp    | 152 +++++++++++++++++++++++++-
 llvm/lib/CodeGen/TargetPassConfig.cpp |   6 +-
 2 files changed, 150 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/CodeGen/StackColoring.cpp b/llvm/lib/CodeGen/StackColoring.cpp
index 4cfdc678643f8..0c876fe195fa3 100644
--- a/llvm/lib/CodeGen/StackColoring.cpp
+++ b/llvm/lib/CodeGen/StackColoring.cpp
@@ -30,6 +30,7 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/LiveStacks.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
@@ -399,6 +400,8 @@ class StackColoring {
   MachineFrameInfo *MFI = nullptr;
   MachineFunction *MF = nullptr;
 
+  LiveStacks* LS = nullptr;
+
   struct SlotInfo {
     // All places in the current function where this Slot is live
     BitVector Liveness;
@@ -484,7 +487,7 @@ class StackColoring {
   unsigned NumIterations;
 
 public:
-  StackColoring(SlotIndexes *Indexes) : Indexes(Indexes) {}
+  StackColoring(SlotIndexes *Indexes, LiveStacks* LS) : LS(LS), Indexes(Indexes) {}
   bool run(MachineFunction &Func);
 
 private:
@@ -573,6 +576,7 @@ INITIALIZE_PASS_END(StackColoringLegacy, DEBUG_TYPE,
 
 void StackColoringLegacy::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.addRequired<SlotIndexesWrapperPass>();
+  AU.addUsedIfAvailable<LiveStacksWrapperLegacy>();
   MachineFunctionPass::getAnalysisUsage(AU);
 }
 
@@ -744,6 +748,9 @@ unsigned StackColoring::collectMarkers(unsigned NumSlot) {
   ConservativeSlots.clear();
   ConservativeSlots.resize(NumSlot);
 
+  if (LS)
+    MarkersFound += LS->getNumIntervals() * 2;
+
   // number of start and end lifetime ops for each slot
   SmallVector<int, 8> NumStartLifetimes(NumSlot, 0);
   SmallVector<int, 8> NumEndLifetimes(NumSlot, 0);
@@ -955,6 +962,113 @@ void StackColoring::calculateLiveIntervals(unsigned NumSlots) {
   int CurrIdx = 0;
 
   DefinitelyInUse.resize(NumSlots);
+  struct SplitSlotChanges {
+    const MachineInstr* AtMI;
+    unsigned BlockIdx : 31;
+    unsigned IsStart : 1;
+    unsigned Slot;
+  };
+  SmallVector<SplitSlotChanges> MidBlockSpillChanges;
+  unsigned SpillChangeCounter = 0;
+
+  if (LS && LS->getNumIntervals()) {
+    for (const MachineBasicBlock &MBB : *MF) {
+      BlockLifetimeInfo &MBBLiveness = BlockLiveness[MBB.getNumber()];
+      MBBLiveness.LiveIn.resize(NumSlots);
+      MBBLiveness.LiveOut.resize(NumSlots);
+    }
+    for (const MachineBasicBlock &MBB : *MF) {
+      unsigned Base = LS->getStartIdx();
+      BlockLifetimeInfo &MBBLiveness = BlockLiveness[MBB.getNumber()];
+      for (unsigned I = 0; I < LS->getNumIntervals(); I++) {
+        unsigned Slot = Base + I;
+        if (LS->getInterval(Slot).liveAt(Indexes->getMBBStartIdx(&MBB))) {
+          MBBLiveness.LiveIn[Slot] = true;
+          // Checking if the end of the block is in the live-range is not
+          // reliable
+          for (MachineBasicBlock *Pred : MBB.predecessors())
+            BlockLiveness[Pred->getNumber()].LiveOut[Slot] = true;
+        }
+      }
+    }
+    for (const MachineBasicBlock &MBB : *MF) {
+      unsigned SizeOnStart = MidBlockSpillChanges.size();
+      BlockLifetimeInfo &MBBLiveness = BlockLiveness[MBB.getNumber()];
+      BitVector IsStoredTo;
+      IsStoredTo.resize(NumSlots, false);
+      struct MIBlockIdx {
+        const MachineInstr* MI;
+        unsigned BlockIdx;
+      };
+      unsigned BlockIdx = 0;
+      SmallVector<MIBlockIdx> LastUse;
+      LastUse.resize(NumSlots, {nullptr, 0});
+      for (const MachineInstr &MI : MBB) {
+        if (MI.isDebugInstr())
+          continue;
+        for (MachineMemOperand* MMO : MI.memoperands()) {
+          auto *PSV = dyn_cast_if_present<FixedStackPseudoSourceValue>(
+              MMO->getPseudoValue());
+          if (!PSV)
+            continue;
+          unsigned Slot = PSV->getFrameIndex();
+          if (!LS->hasInterval(Slot))
+            continue;
+          // if (Slot == 17) {
+          //   dbgs() << "MI: " << MI;
+          //   dbgs() << "MBB: " << MBB.getName() << "\n";
+          //   dbgs() << "MBB range:" << Indexes->getMBBRange(&MBB).first << "-"
+          //          << Indexes->getMBBRange(&MBB).second << "\n";
+          //   dbgs() << "slot range: " << LS->getInterval(Slot) << "\n";
+          //   dbgs() << "\n";
+          // }
+          assert(MMO->isStore() != MMO->isLoad());
+          if (MMO->isStore()) {
+            if (!IsStoredTo[Slot]) {
+              MidBlockSpillChanges.push_back(
+                  {&MI, BlockIdx, /*IsStart=*/true, Slot});
+              IsStoredTo[Slot] = true;
+            }
+          } else
+            LastUse[Slot] = {&MI, BlockIdx};
+        }
+        BlockIdx++;
+      }
+
+      BitVector Liveness = MBBLiveness.LiveIn;
+      Liveness |= IsStoredTo;
+      Liveness &= MBBLiveness.LiveOut.flip();
+      for (unsigned Slot : Liveness.set_bits()) {
+        if (!LS->hasInterval(Slot))
+          continue;
+        if (LastUse[Slot].MI)
+          MidBlockSpillChanges.push_back({LastUse[Slot].MI,
+                                          LastUse[Slot].BlockIdx,
+                                          /*IsStart=*/false, Slot});
+      }
+
+      std::stable_sort(MidBlockSpillChanges.begin() + SizeOnStart,
+                       MidBlockSpillChanges.end(),
+                       [&](SplitSlotChanges Lhs, SplitSlotChanges Rhs) -> bool {
+                         if (Lhs.BlockIdx == Rhs.BlockIdx)
+                           assert(Lhs.Slot != Rhs.Slot);
+                         if (Lhs.BlockIdx != Rhs.BlockIdx)
+                           return Lhs.BlockIdx < Rhs.BlockIdx;
+                         // Avoid overlap of lifetime when the same instruction
+                         // starts some spill lifetime and ends others.
+                         return Rhs.IsStart;
+                       });
+    }
+  }
+  LLVM_DEBUG({
+    for (SplitSlotChanges C : MidBlockSpillChanges) {
+        dbgs() << "Idx=" << C.BlockIdx << " Slot=" << C.Slot
+               << " IsStart=" << C.IsStart << " MI=" << *C.AtMI;
+    }
+  });
+
+  // To avoid needing bounds checks
+  MidBlockSpillChanges.push_back({nullptr, 0, false, InvalidIdx});
 
   // For each block, find which slots are active within this block
   // and update the live intervals.
@@ -986,10 +1100,15 @@ void StackColoring::calculateLiveIntervals(unsigned NumSlots) {
     for (const MachineInstr &MI : MBB) {
       SmallVector<int, 4> slots;
       bool IsStart = false;
-      if (!isLifetimeStartOrEnd(MI, slots, IsStart))
+      bool AnyChange = isLifetimeStartOrEnd(MI, slots, IsStart);
+      AnyChange |= MidBlockSpillChanges[SpillChangeCounter].AtMI == &MI;
+      if (!AnyChange)
         continue;
       SlotIndex ThisIndex = Indexes->getInstructionIndex(MI);
-      for (auto Slot : slots) {
+      auto OnChange = [&](unsigned Slot, bool IsStart) {
+        // if (Slot == 3) {
+        //   outs() << "HERE\n";
+        // }
         if (IsStart) {
           StartedSinceInc = true;
           // If a slot is already definitely in use, we don't have to emit
@@ -1016,6 +1135,14 @@ void StackColoring::calculateLiveIntervals(unsigned NumSlots) {
           if (StartIdx[Slot] != -1)
             EndRangeFor(Slot);
         }
+      };
+      for (auto Slot : slots)
+        OnChange(Slot, IsStart);
+      for (; SpillChangeCounter < MidBlockSpillChanges.size() &&
+             MidBlockSpillChanges[SpillChangeCounter].AtMI == &MI;
+           SpillChangeCounter++) {
+        SplitSlotChanges Change = MidBlockSpillChanges[SpillChangeCounter];
+        OnChange(Change.Slot, Change.IsStart);
       }
     }
 
@@ -1035,6 +1162,9 @@ void StackColoring::calculateLiveIntervals(unsigned NumSlots) {
       Intervals[i]->addSegment(LiveInterval::Segment(Starts[i], EndIdx, VNI));
     }
   }
+  // Make sure we reached the end
+  assert(!MidBlockSpillChanges[SpillChangeCounter].AtMI);
+
   LivenessSize = CurrIdx;
   for (SlotInfo &Info : Slot2Info) {
     Info.Liveness.resize(CurrIdx);
@@ -1043,6 +1173,7 @@ void StackColoring::calculateLiveIntervals(unsigned NumSlots) {
     // SlotInfo::hasOverlap, which should have better cache locality
     std::sort(Info.StartLiveness.begin(), Info.StartLiveness.end());
 #ifndef NDEBUG
+    assert(Info.Liveness.any() == !Info.StartLiveness.empty());
     for (int Start : Info.StartLiveness)
       assert(Info.Liveness[Start]);
 #endif
@@ -1380,13 +1511,19 @@ bool StackColoringLegacy::runOnMachineFunction(MachineFunction &MF) {
   if (skipFunction(MF.getFunction()))
     return false;
 
-  StackColoring SC(&getAnalysis<SlotIndexesWrapperPass>().getSI());
+  LiveStacks* LS = nullptr;
+  LiveStacksWrapperLegacy* LSWL = getAnalysisIfAvailable<LiveStacksWrapperLegacy>();
+  if (LSWL)
+    LS = &LSWL->getLS();
+
+  StackColoring SC(&getAnalysis<SlotIndexesWrapperPass>().getSI(), LS);
   return SC.run(MF);
 }
 
 PreservedAnalyses StackColoringPass::run(MachineFunction &MF,
                                          MachineFunctionAnalysisManager &MFAM) {
-  StackColoring SC(&MFAM.getResult<SlotIndexesAnalysis>(MF));
+  StackColoring SC(&MFAM.getResult<SlotIndexesAnalysis>(MF),
+                   MFAM.getCachedResult<LiveStacksAnalysis>(MF));
   if (SC.run(MF))
     return PreservedAnalyses::none();
   return PreservedAnalyses::all();
@@ -1400,6 +1537,8 @@ unsigned StackColoring::doMerging(unsigned NumSlots) {
   int64_t OrigPesSize = 0;
   for (unsigned Slot = 0; Slot < NumSlots; Slot++) {
     SlotInfo& Info = Slot2Info[Slot];
+    if (Info.StartLiveness.empty())
+      assert(!LS || !LS->hasInterval(Slot));
     if (!Info.StartLiveness.empty() &&
         DebugCounter::shouldExecute(ProcessSlot)) {
       FinalAlign = std::max(FinalAlign, Info.Align);
@@ -1596,6 +1735,9 @@ bool StackColoring::run(MachineFunction &Func) {
 
   unsigned NumSlots = MFI->getObjectIndexEnd();
 
+  // if (MF->getName() == "_ZL9transformPjS_Rm")
+  //   outs() << "HERE\n";
+
   // If there are no stack slots then there are no markers to remove.
   if (NumSlots < 2 || DisableColoring)
     return removeAllMarkers();
diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp
index 864c7c8acd3b2..343e25ae17fd7 100644
--- a/llvm/lib/CodeGen/TargetPassConfig.cpp
+++ b/llvm/lib/CodeGen/TargetPassConfig.cpp
@@ -1498,13 +1498,13 @@ void TargetPassConfig::addOptimizedRegAlloc() {
   addPass(&MachineSchedulerID);
 
   if (addRegAssignAndRewriteOptimized()) {
-    // Perform stack slot coloring and post-ra machine LICM.
-    addPass(&StackSlotColoringID);
-
     if (MergedStackColoring) {
       // This pass merges large allocas. StackSlotColoring is a different pass
       // which merges spill slots.
       addPass(&StackColoringLegacyID);
+    } else {
+      // Perform stack slot coloring and post-ra machine LICM.
+      addPass(&StackSlotColoringID);
     }
 
     // Allow targets to expand pseudo instructions depending on the choice of

>From 78e9bca2c7b30b5dd242ee112379b214bf11eee0 Mon Sep 17 00:00:00 2001
From: tyker <tyker1 at outlook.com>
Date: Tue, 10 Jun 2025 17:30:27 +0200
Subject: [PATCH 14/19] Fix bug + add comments + reduce ammount of debug prints

---
 llvm/lib/CodeGen/StackColoring.cpp | 96 ++++++++++++++++++++----------
 1 file changed, 65 insertions(+), 31 deletions(-)

diff --git a/llvm/lib/CodeGen/StackColoring.cpp b/llvm/lib/CodeGen/StackColoring.cpp
index 0c876fe195fa3..b9a3b5b3bc8c8 100644
--- a/llvm/lib/CodeGen/StackColoring.cpp
+++ b/llvm/lib/CodeGen/StackColoring.cpp
@@ -648,9 +648,6 @@ LLVM_DUMP_METHOD void StackColoring::dumpIntervals() const {
       dbgs() << ' ' << SIdx;
     dbgs() << '\n';
   }
-  for (unsigned Slot = 0; Slot < Slot2Info.size(); Slot++) {
-    Slot2Info[Slot].dump(this);
-  }
 }
 
 LLVM_DUMP_METHOD void StackColoring::SlotInfo::dump(const StackColoring* State) const {
@@ -1060,12 +1057,6 @@ void StackColoring::calculateLiveIntervals(unsigned NumSlots) {
                        });
     }
   }
-  LLVM_DEBUG({
-    for (SplitSlotChanges C : MidBlockSpillChanges) {
-        dbgs() << "Idx=" << C.BlockIdx << " Slot=" << C.Slot
-               << " IsStart=" << C.IsStart << " MI=" << *C.AtMI;
-    }
-  });
 
   // To avoid needing bounds checks
   MidBlockSpillChanges.push_back({nullptr, 0, false, InvalidIdx});
@@ -1583,10 +1574,6 @@ unsigned StackColoring::doMerging(unsigned NumSlots) {
     }
     std::stable_sort(
         SlotStack.begin(), SlotStack.end(), [&](unsigned Lhs, unsigned Rhs) {
-          if (Lhs == InvalidIdx)
-            return false;
-          if (Rhs == InvalidIdx)
-            return true;
           return Slot2Info[Lhs].SlotPriority < Slot2Info[Rhs].SlotPriority;
         });
   }
@@ -1594,6 +1581,7 @@ unsigned StackColoring::doMerging(unsigned NumSlots) {
   SlotInfo* LastQueryLhs = nullptr;
   SlotInfo* LastQueryRhs = nullptr;
   bool LastQueryRes = false;
+  // TODO: Real caching ?
   auto HasOverlapCached = [&](SlotInfo &Lhs, SlotInfo &Rhs) {
     if (&Lhs == LastQueryLhs && LastQueryRhs == &Rhs)
       return LastQueryRes;
@@ -1604,8 +1592,14 @@ unsigned StackColoring::doMerging(unsigned NumSlots) {
   };
 
   struct Status {
+    // This is the offset at which a slot on top should be placed. So the offset
+    // of the slot + the size of the slot
     unsigned Offset = 0;
+
+    // The Slot just below the offset.
     unsigned Slot = InvalidIdx;
+
+    // The index of the previous status in OlderStatus
     unsigned Prev = InvalidIdx;
   };
 
@@ -1616,22 +1610,41 @@ unsigned StackColoring::doMerging(unsigned NumSlots) {
   auto FindOffset = [&](SlotInfo &Info, unsigned Pt) {
     Status *Last = &LatestStatus[Pt];
 
-    // This is only called on Slot that have overlapping lifetimes
-    // So the no overlap only happens when there lifetime overlap but only one
-    // can be live because where they start in the CFG is mutually exclusive
-    // See the comment about implementation for an example
+    // The slots in the linked-list are always kept in ascending order, so the
+    // earliest slot has the lowest offset.
+    // This loop handles the case where the latest slot and Info cannot both
+    // be live due to the CFG: lifetimes overlap, yet they can share memory.
     while (LLVM_UNLIKELY(Last->Slot != InvalidIdx &&
                          !HasOverlapCached(Info, Slot2Info[Last->Slot])))
       Last = &OlderStatus[Last->Prev];
     return Last->Offset;
   };
   auto UpdateOffset = [&](SlotInfo &Info, unsigned Pt, unsigned Offset) {
-    Status& Last = LatestStatus[Pt];
+    Status* Last = &LatestStatus[Pt];
     unsigned Idx = OlderStatus.size();
-    OlderStatus.push_back(Last);
-    Last.Prev = Idx;
-    Last.Offset = Offset;
-    Last.Slot = &Info - Slot2Info.data();
+    OlderStatus.push_back(*Last);
+
+    // This branch is not taken only when we are inserting a slot that wasn't
+    // overlapping with the previous slot and is smaller, so the inserted slot
+    // is not the new start of the linked-list.
+    if (LLVM_LIKELY(Last->Offset <= Offset)) {
+      Last->Prev = Idx;
+      Last->Offset = Offset;
+      Last->Slot = &Info - Slot2Info.data();
+      return;
+    }
+
+    // Ensure ordering of slots
+    Status* Inserted = &OlderStatus.back();
+    Inserted->Offset = Offset;
+    Inserted->Slot = &Info - Slot2Info.data();
+    Status *Curr = Last;
+    while (Curr->Prev != InvalidIdx && OlderStatus[Curr->Prev].Offset > Offset)
+      Curr = &OlderStatus[Curr->Prev];
+
+    // Insert the new node in the linked-list
+    Inserted->Prev = Curr->Prev;
+    Curr->Prev = Idx;
   };
 
   SmallVector<unsigned, MaxCandidatesToConsiderDefault> Candidates;
@@ -1643,25 +1656,34 @@ unsigned StackColoring::doMerging(unsigned NumSlots) {
     Candidates.push_back(SlotStack.pop_back_val());
   }
 
+  unsigned WorseCaseOffset = 0;
   while (!Candidates.empty()) {
-    int64_t BestScore = std::numeric_limits<int64_t>::max();
     unsigned BestIdx = InvalidIdx;
     unsigned BestOffset = InvalidIdx;
 
     for (unsigned K = 0; K < Candidates.size(); K++) {
       SlotInfo &Info = Slot2Info[Candidates[K]];
       unsigned Offset = 0;
-      for (unsigned Pt : Info.Liveness.set_bits())
+      for (unsigned Pt : Info.Liveness.set_bits()) {
         Offset = std::max(Offset, FindOffset(Info, Pt));
 
+        // If Offset == WorseCaseOffset, this is always a valid, options. so no
+        // more checking needed
+        // If Offset > BestOffset, we already found a better solution, so this
+        // one doesn't matter
+        if (Offset == WorseCaseOffset || Offset > BestOffset)
+          break;
+      }
+
       Offset = alignTo(Offset, Info.Align);
 
-      int64_t Score = (int64_t)Offset - (int64_t)Log2(Info.Align);
-      LLVM_DEBUG(dbgs() << "SlotInfo(" << Candidates[K] << ") Score=" << Score << "\n");
+      LLVM_DEBUG(dbgs() << "choice: SlotInfo(" << Candidates[K] << ") at " << Offset << "\n");
       bool IsBetter = [&] {
-        if (BestScore != Score)
-          return BestScore > Score;
+        if (BestOffset != Offset)
+          return BestOffset > Offset;
         SlotInfo &Other = Slot2Info[Candidates[K]];
+        if (Other.Align != Info.Align)
+          return Other.Align < Info.Align;
         if (Other.Size != Info.Size)
           return Other.Size < Info.Size;
         if (Other.SlotPriority != Info.SlotPriority)
@@ -1672,7 +1694,6 @@ unsigned StackColoring::doMerging(unsigned NumSlots) {
       }();
 
       if (IsBetter) {
-        BestScore = Score;
         BestIdx = K;
         BestOffset = Offset;
       }
@@ -1681,11 +1702,24 @@ unsigned StackColoring::doMerging(unsigned NumSlots) {
 
     LLVM_DEBUG(Info.dump(this));
     LLVM_DEBUG(dbgs() << "Placing SlotInfo(" << Candidates[BestIdx] << ") at "
-                      << BestOffset << " Score=" << BestScore << "\n");
+                      << BestOffset << "\n");
 
     Info.Offset = BestOffset;
+    WorseCaseOffset = std::max(WorseCaseOffset, BestOffset + Info.Size);
     for (unsigned Pt : Info.Liveness.set_bits())
       UpdateOffset(Info, Pt, BestOffset + Info.Size);
+#ifdef EXPENSIVE_CHECKS
+    // Validate the order of offsets in the linked-list
+    for (Status &S : LatestStatus) {
+      Status *Curr = &S;
+      unsigned CurrOffset = Curr->Offset;
+      while (Curr->Prev != InvalidIdx) {
+        assert(Curr->Offset <= CurrOffset);
+        CurrOffset = Curr->Offset;
+        Curr = &OlderStatus[Curr->Prev];
+      }
+    }
+#endif
 
     std::swap(Candidates[BestIdx], Candidates.back());
     Candidates.pop_back();
@@ -1788,7 +1822,6 @@ bool StackColoring::run(MachineFunction &Func) {
 
   // Propagate the liveness information.
   calculateLiveIntervals(NumSlots);
-  LLVM_DEBUG(dumpIntervals());
 
   // Search for allocas which are used outside of the declared lifetime
   // markers.
@@ -1796,6 +1829,7 @@ bool StackColoring::run(MachineFunction &Func) {
     removeInvalidSlotRanges();
 
   if (!UseNewStackColoring) {
+    LLVM_DEBUG(dumpIntervals());
     // Maps old slots to new slots.
     DenseMap<int, int> SlotRemap;
     unsigned RemovedSlots = 0;

>From 74054b5ff57110349d0531c9044335656dd74e4a Mon Sep 17 00:00:00 2001
From: tyker <tyker1 at outlook.com>
Date: Tue, 10 Jun 2025 22:15:54 +0200
Subject: [PATCH 15/19] [NFC] Make StackColoring debug mode more concise

---
 llvm/lib/CodeGen/StackColoring.cpp | 62 ++++++++++++++++++------------
 1 file changed, 37 insertions(+), 25 deletions(-)

diff --git a/llvm/lib/CodeGen/StackColoring.cpp b/llvm/lib/CodeGen/StackColoring.cpp
index b9a3b5b3bc8c8..b12ef12df38b6 100644
--- a/llvm/lib/CodeGen/StackColoring.cpp
+++ b/llvm/lib/CodeGen/StackColoring.cpp
@@ -652,19 +652,23 @@ LLVM_DUMP_METHOD void StackColoring::dumpIntervals() const {
 
 LLVM_DUMP_METHOD void StackColoring::SlotInfo::dump(const StackColoring* State) const {
   unsigned Slot = InvalidIdx;
-  if (State)
+  if (State) {
     Slot = this - State->Slot2Info.data();
+    dbgs() << "fi#" << Slot;
+  } else
   dbgs() << "SlotInfo"; 
-  if (State)
-    dbgs() << "(" << Slot << ")";
-  dbgs()<< ": ";
-  dbgs() << '\n';
-  if (State)
-    if (State->MFI->getObjectAllocation(Slot)) {
-      State->MFI->getObjectAllocation(Slot)->print(dbgs());
-      dbgs() << '\n';
+  dbgs() << ":";
+  if (Offset != InvalidIdx)
+    dbgs() << " offset=" << Offset;
+  if (State) {
+    if (State->MFI->getObjectAllocation(Slot))
+      dbgs() << " \"" << State->MFI->getObjectAllocation(Slot)->getName() << "\"";
+    if (State->MFI->isSpillSlotObjectIndex(Slot))
+      dbgs() << " spill";
     }
-  dbgs() << "Size=" << Size << " Align=" << Align.value() << '\n';
+  dbgs() << " size=" << Size << " align=" << Align.value() << '\n';
+  if (IndexBasedLiveRange)
+    dbgs() << "Index: " << *IndexBasedLiveRange << "\n";
   dumpBV("LIVENESS   ", Liveness);
   BitVector Start;
   Start.resize(Liveness.size());
@@ -1607,7 +1611,7 @@ unsigned StackColoring::doMerging(unsigned NumSlots) {
   LatestStatus.resize(LivenessSize, Status{});
   SmallVector<Status> OlderStatus;
 
-  auto FindOffset = [&](SlotInfo &Info, unsigned Pt) {
+  auto FindStatus = [&](SlotInfo &Info, unsigned Pt) -> Status& {
     Status *Last = &LatestStatus[Pt];
 
     // The slots in the linked-list are always kept in ascending order, so the
@@ -1617,9 +1621,9 @@ unsigned StackColoring::doMerging(unsigned NumSlots) {
     while (LLVM_UNLIKELY(Last->Slot != InvalidIdx &&
                          !HasOverlapCached(Info, Slot2Info[Last->Slot])))
       Last = &OlderStatus[Last->Prev];
-    return Last->Offset;
+    return *Last;
   };
-  auto UpdateOffset = [&](SlotInfo &Info, unsigned Pt, unsigned Offset) {
+  auto UpdateStatus = [&](SlotInfo &Info, unsigned Pt, unsigned Offset) {
     Status* Last = &LatestStatus[Pt];
     unsigned Idx = OlderStatus.size();
     OlderStatus.push_back(*Last);
@@ -1656,16 +1660,25 @@ unsigned StackColoring::doMerging(unsigned NumSlots) {
     Candidates.push_back(SlotStack.pop_back_val());
   }
 
+  LLVM_DEBUG(dbgs() << "\nStarting Placement:\n");
   unsigned WorseCaseOffset = 0;
   while (!Candidates.empty()) {
     unsigned BestIdx = InvalidIdx;
     unsigned BestOffset = InvalidIdx;
 
+    LLVM_DEBUG(dbgs() << "top=" << WorseCaseOffset << " choosing: ");
     for (unsigned K = 0; K < Candidates.size(); K++) {
       SlotInfo &Info = Slot2Info[Candidates[K]];
       unsigned Offset = 0;
+      unsigned PrevSlot = InvalidIdx;
+      (void)PrevSlot; // Only use in LLVM_DEBUG
+
       for (unsigned Pt : Info.Liveness.set_bits()) {
-        Offset = std::max(Offset, FindOffset(Info, Pt));
+        Status S = FindStatus(Info, Pt);
+        if (S.Offset > Offset) {
+          PrevSlot = S.Slot;
+          Offset = S.Offset;
+        }
 
         // If Offset == WorseCaseOffset, this is always a valid, options. so no
         // more checking needed
@@ -1677,7 +1690,10 @@ unsigned StackColoring::doMerging(unsigned NumSlots) {
 
       Offset = alignTo(Offset, Info.Align);
 
-      LLVM_DEBUG(dbgs() << "choice: SlotInfo(" << Candidates[K] << ") at " << Offset << "\n");
+      LLVM_DEBUG(dbgs() << "fi#" << Candidates[K] << "@" << Offset << "->";
+                 if (PrevSlot == InvalidIdx) dbgs() << "bottom";
+                 else dbgs() << "fi#" << PrevSlot; dbgs() << ", ";);
+
       bool IsBetter = [&] {
         if (BestOffset != Offset)
           return BestOffset > Offset;
@@ -1699,15 +1715,15 @@ unsigned StackColoring::doMerging(unsigned NumSlots) {
       }
     }
     SlotInfo &Info = Slot2Info[Candidates[BestIdx]];
+    Info.Offset = BestOffset;
+    WorseCaseOffset = std::max(WorseCaseOffset, BestOffset + Info.Size);
 
+    LLVM_DEBUG(dbgs() << "\n");
+    LLVM_DEBUG(dbgs() << "Placing: ");
     LLVM_DEBUG(Info.dump(this));
-    LLVM_DEBUG(dbgs() << "Placing SlotInfo(" << Candidates[BestIdx] << ") at "
-                      << BestOffset << "\n");
 
-    Info.Offset = BestOffset;
-    WorseCaseOffset = std::max(WorseCaseOffset, BestOffset + Info.Size);
     for (unsigned Pt : Info.Liveness.set_bits())
-      UpdateOffset(Info, Pt, BestOffset + Info.Size);
+      UpdateStatus(Info, Pt, BestOffset + Info.Size);
 #ifdef EXPENSIVE_CHECKS
     // Validate the order of offsets in the linked-list
     for (Status &S : LatestStatus) {
@@ -1786,13 +1802,9 @@ bool StackColoring::run(MachineFunction &Func) {
   unsigned TotalSize = 0;
   LLVM_DEBUG(dbgs() << "Found " << NumMarkers << " markers and " << NumSlots
                     << " slots\n");
-  LLVM_DEBUG(dbgs() << "Slot structure:\n");
 
-  for (int i=0; i < MFI->getObjectIndexEnd(); ++i) {
-    LLVM_DEBUG(dbgs() << "Slot #" << i << " - " << MFI->getObjectSize(i)
-                      << " bytes.\n");
+  for (int i=0; i < MFI->getObjectIndexEnd(); ++i)
     TotalSize += MFI->getObjectSize(i);
-  }
 
   LLVM_DEBUG(dbgs() << "Total Stack size: " << TotalSize << " bytes\n\n");
 

>From 709b55ae6ae14f2203a4b3df93b92198867ae1d8 Mon Sep 17 00:00:00 2001
From: tyker <tyker1 at outlook.com>
Date: Wed, 11 Jun 2025 23:59:11 +0200
Subject: [PATCH 16/19] [NFC] Cleanup + comments

---
 llvm/lib/CodeGen/StackColoring.cpp | 89 ++++++++++++------------------
 1 file changed, 34 insertions(+), 55 deletions(-)

diff --git a/llvm/lib/CodeGen/StackColoring.cpp b/llvm/lib/CodeGen/StackColoring.cpp
index b12ef12df38b6..f00b5a17f8b91 100644
--- a/llvm/lib/CodeGen/StackColoring.cpp
+++ b/llvm/lib/CodeGen/StackColoring.cpp
@@ -10,13 +10,7 @@
 // lifetime markers machine instructions (LIFETIME_START and LIFETIME_END),
 // which represent the possible lifetime of stack slots. It attempts to
 // merge disjoint stack slots and reduce the used stack space.
-// NOTE: This pass is not StackSlotColoring, which optimizes spill slots.
-//
-// TODO: In the future we plan to improve stack coloring in the following ways:
-// 1. Allow merging multiple small slots into a single larger slot at different
-//    offsets.
-// 2. Merge this pass with StackSlotColoring and allow merging of allocas with
-//    spill slots.
+// NOTE: This pass is not StackSlotColoring, which optimizes only spill slots.
 //
 //===----------------------------------------------------------------------===//
 
@@ -25,7 +19,6 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/DepthFirstIterator.h"
 #include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/ValueTracking.h"
@@ -100,9 +93,8 @@ static cl::opt<bool> UseNewStackColoring(
     "new-stack-coloring", cl::init(false), cl::Hidden,
     cl::desc("Use a better logic to try to reduce stack usage"));
 
-static constexpr unsigned MaxCandidatesToConsiderDefault = 5;
-static cl::opt<unsigned> MaxCandidatesToConsider(
-    "stackcoloring-max-candidates", cl::init(MaxCandidatesToConsiderDefault),
+static cl::opt<unsigned> MaxCandidatesOpt(
+    "stackcoloring-max-candidates", cl::init(0),
     cl::Hidden,
     cl::desc(
         "Max number of candidates that will be evaluated, 0 means no limit"));
@@ -656,7 +648,7 @@ LLVM_DUMP_METHOD void StackColoring::SlotInfo::dump(const StackColoring* State)
     Slot = this - State->Slot2Info.data();
     dbgs() << "fi#" << Slot;
   } else
-  dbgs() << "SlotInfo"; 
+    dbgs() << "SlotInfo";
   dbgs() << ":";
   if (Offset != InvalidIdx)
     dbgs() << " offset=" << Offset;
@@ -665,10 +657,8 @@ LLVM_DUMP_METHOD void StackColoring::SlotInfo::dump(const StackColoring* State)
       dbgs() << " \"" << State->MFI->getObjectAllocation(Slot)->getName() << "\"";
     if (State->MFI->isSpillSlotObjectIndex(Slot))
       dbgs() << " spill";
-    }
+  }
   dbgs() << " size=" << Size << " align=" << Align.value() << '\n';
-  if (IndexBasedLiveRange)
-    dbgs() << "Index: " << *IndexBasedLiveRange << "\n";
   dumpBV("LIVENESS   ", Liveness);
   BitVector Start;
   Start.resize(Liveness.size());
@@ -973,6 +963,13 @@ void StackColoring::calculateLiveIntervals(unsigned NumSlots) {
   unsigned SpillChangeCounter = 0;
 
   if (LS && LS->getNumIntervals()) {
+    // Here we prepare the lifetime information of spill slots.
+    // Live ranges in the LiveStacks seem to be slightly outdated in many small
+    // ways. This is not an issue for stack-slot-coloring, because it is only
+    // operating on LiveRanges from LiveStacks, but it is an issue here.
+    // So we only rely on LiveStacks to give us live edges, and conservatively
+    // reconstruct in-block liveness changes.
+
     for (const MachineBasicBlock &MBB : *MF) {
       BlockLifetimeInfo &MBBLiveness = BlockLiveness[MBB.getNumber()];
       MBBLiveness.LiveIn.resize(NumSlots);
@@ -1015,14 +1012,6 @@ void StackColoring::calculateLiveIntervals(unsigned NumSlots) {
           unsigned Slot = PSV->getFrameIndex();
           if (!LS->hasInterval(Slot))
             continue;
-          // if (Slot == 17) {
-          //   dbgs() << "MI: " << MI;
-          //   dbgs() << "MBB: " << MBB.getName() << "\n";
-          //   dbgs() << "MBB range:" << Indexes->getMBBRange(&MBB).first << "-"
-          //          << Indexes->getMBBRange(&MBB).second << "\n";
-          //   dbgs() << "slot range: " << LS->getInterval(Slot) << "\n";
-          //   dbgs() << "\n";
-          // }
           assert(MMO->isStore() != MMO->isLoad());
           if (MMO->isStore()) {
             if (!IsStoredTo[Slot]) {
@@ -1048,6 +1037,8 @@ void StackColoring::calculateLiveIntervals(unsigned NumSlots) {
                                           /*IsStart=*/false, Slot});
       }
 
+      // Ensure that the changes are in the same order they will be found and
+      // need to be processed in
       std::stable_sort(MidBlockSpillChanges.begin() + SizeOnStart,
                        MidBlockSpillChanges.end(),
                        [&](SplitSlotChanges Lhs, SplitSlotChanges Rhs) -> bool {
@@ -1081,6 +1072,8 @@ void StackColoring::calculateLiveIntervals(unsigned NumSlots) {
 
     bool StartedSinceInc = false;
     auto EndRangeFor = [&](int Slot) {
+      // The fewer indices the better, so we only increment the index when the
+      // ranges would not be accurate otherwise
       if (StartIdx[Slot] == CurrIdx || StartedSinceInc) {
         CurrIdx++;
         StartedSinceInc = false;
@@ -1101,9 +1094,6 @@ void StackColoring::calculateLiveIntervals(unsigned NumSlots) {
         continue;
       SlotIndex ThisIndex = Indexes->getInstructionIndex(MI);
       auto OnChange = [&](unsigned Slot, bool IsStart) {
-        // if (Slot == 3) {
-        //   outs() << "HERE\n";
-        // }
         if (IsStart) {
           StartedSinceInc = true;
           // If a slot is already definitely in use, we don't have to emit
@@ -1209,7 +1199,6 @@ void StackColoring::remapInstructions(DenseMap<int, int>& SlotRemap, int MergedS
       continue;
     int Slot = VI.getStackSlot();
     if (Slot >= 0 && Slot2Info[Slot].Offset != InvalidIdx) {
-      // FIXME: properly update the offset into MergedSlot debug
       VI.updateStackSlot(MergedSlot);
     }
     if (auto It = SlotRemap.find(Slot); It != SlotRemap.end()) {
@@ -1585,7 +1574,7 @@ unsigned StackColoring::doMerging(unsigned NumSlots) {
   SlotInfo* LastQueryLhs = nullptr;
   SlotInfo* LastQueryRhs = nullptr;
   bool LastQueryRes = false;
-  // TODO: Real caching ?
+  // Maybe there should be real caching here
   auto HasOverlapCached = [&](SlotInfo &Lhs, SlotInfo &Rhs) {
     if (&Lhs == LastQueryLhs && LastQueryRhs == &Rhs)
       return LastQueryRes;
@@ -1616,8 +1605,10 @@ unsigned StackColoring::doMerging(unsigned NumSlots) {
 
     // The slots in the linked-list are always kept in ascending order, so the
     // earliest slot has the lowest offset
-    // This loop handles cases where the latest slot doesn't cannot be both live
-    // because of the CFG, so even if there lifetime overlap, they can overlap
+    // This loop handles cases where this slot and the latest slot cannot
+    // both be live because of the CFG, so even if their lifetimes
+    // overlap, the slots themselves can overlap.
+    // See the comment about the implementation higher in the file.
     while (LLVM_UNLIKELY(Last->Slot != InvalidIdx &&
                          !HasOverlapCached(Info, Slot2Info[Last->Slot])))
       Last = &OlderStatus[Last->Prev];
@@ -1638,7 +1629,7 @@ unsigned StackColoring::doMerging(unsigned NumSlots) {
       return;
     }
 
-    // Insure ordering of slots
+    // Ensure ordering of slots
     Status* Inserted = &OlderStatus.back();
     Inserted->Offset = Offset;
     Inserted->Slot = &Info - Slot2Info.data();
@@ -1651,9 +1642,10 @@ unsigned StackColoring::doMerging(unsigned NumSlots) {
     Curr->Prev = Idx;
   };
 
-  SmallVector<unsigned, MaxCandidatesToConsiderDefault> Candidates;
-  unsigned MaxCandidates =
-      MaxCandidatesToConsider == 0 ? ~0u : MaxCandidatesToConsider;
+  // This is a vector but element ordering is not relevant
+  SmallVector<unsigned> Candidates;
+
+  unsigned MaxCandidates = MaxCandidatesOpt == 0 ? ~0u : MaxCandidatesOpt;
   for (unsigned I = 0; I < MaxCandidates; I++) {
     if (SlotStack.empty())
       break;
@@ -1666,7 +1658,7 @@ unsigned StackColoring::doMerging(unsigned NumSlots) {
     unsigned BestIdx = InvalidIdx;
     unsigned BestOffset = InvalidIdx;
 
-    LLVM_DEBUG(dbgs() << "top=" << WorseCaseOffset << " choosing: ");
+    LLVM_DEBUG(dbgs() << "Worse is at " << WorseCaseOffset << ", choosing: ");
     for (unsigned K = 0; K < Candidates.size(); K++) {
       SlotInfo &Info = Slot2Info[Candidates[K]];
       unsigned Offset = 0;
@@ -1705,7 +1697,8 @@ unsigned StackColoring::doMerging(unsigned NumSlots) {
         if (Other.SlotPriority != Info.SlotPriority)
           return Other.SlotPriority < Info.SlotPriority;
 
-        // Both are always stored in Slot2Info, so this is deterministic
+        // Both are always stored in Slot2Info, so this is equivalent to
+        // FrameIndex comparison
         return &Other < &Info;
       }();
 
@@ -1783,10 +1776,10 @@ bool StackColoring::run(MachineFunction &Func) {
   VNInfoAllocator.Reset();
   Slot2Info.clear();
 
-  unsigned NumSlots = MFI->getObjectIndexEnd();
+  if (!UseNewStackColoring)
+    LS = nullptr;
 
-  // if (MF->getName() == "_ZL9transformPjS_Rm")
-  //   outs() << "HERE\n";
+  unsigned NumSlots = MFI->getObjectIndexEnd();
 
   // If there are no stack slots then there are no markers to remove.
   if (NumSlots < 2 || DisableColoring)
@@ -1898,22 +1891,10 @@ bool StackColoring::run(MachineFunction &Func) {
           auto &SecondS = LiveStarts[SecondSlot];
           assert(!First->empty() && !Second->empty() && "Found an empty range");
 
-          bool OldNoOverlap = !First->isLiveAtIndexes(SecondS) &&
-                              !Second->isLiveAtIndexes(FirstS);
-
-          SlotInfo &FSlot = Slot2Info[FirstSlot];
-          SlotInfo &SSlot = Slot2Info[SecondSlot];
-          bool NewNoOverlap = !FSlot.hasOverlap(SSlot);
-
-          // if (NewNoOverlap != OldNoOverlap) {
-          //   LLVM_DEBUG(dbgs() << "OldNoOverlap=" << OldNoOverlap
-          //                     << " NewNoOverlap=" << NewNoOverlap << "\n");
-          // }
-          // assert(OldNoOverlap == NewNoOverlap);
-
           // Merge disjoint slots. This is a little bit tricky - see the
           // Implementation Notes section for an explanation.
-          if (OldNoOverlap) {
+          if (!First->isLiveAtIndexes(SecondS) &&
+              !Second->isLiveAtIndexes(FirstS)) {
             Changed = true;
             First->MergeSegmentsInAsValue(*Second, First->getValNumInfo(0));
 
@@ -1922,8 +1903,6 @@ bool StackColoring::run(MachineFunction &Func) {
             auto Mid = FirstS.begin() + OldSize;
             std::inplace_merge(FirstS.begin(), Mid, FirstS.end());
 
-            // FSlot.Liveness |= SSlot.Liveness;
-
             SlotRemap[SecondSlot] = FirstSlot;
             SortedSlots[J] = -1;
             LLVM_DEBUG(dbgs() << "Merging #" << FirstSlot << " and slots #"

>From 0472ecd8ba4e672542853d74f390decba47b3dd8 Mon Sep 17 00:00:00 2001
From: tyker <tyker1 at outlook.com>
Date: Thu, 12 Jun 2025 00:27:07 +0200
Subject: [PATCH 17/19] Cleanup the Diff

---
 llvm/lib/CodeGen/StackColoring.cpp | 36 ++++++++----------------------
 1 file changed, 9 insertions(+), 27 deletions(-)

diff --git a/llvm/lib/CodeGen/StackColoring.cpp b/llvm/lib/CodeGen/StackColoring.cpp
index f00b5a17f8b91..9fdf5c426201b 100644
--- a/llvm/lib/CodeGen/StackColoring.cpp
+++ b/llvm/lib/CodeGen/StackColoring.cpp
@@ -529,7 +529,7 @@ class StackColoring {
 
   /// Go over the machine function and change instructions which use stack
   /// slots to use the joint slots.
-  void remapInstructions(DenseMap<int, int> &SlotRemap, int MergedSlot);
+  void remapInstructions(DenseMap<int, int> &SlotRemap);
 
   /// The input program may contain instructions which are not inside lifetime
   /// markers. This can happen due to a bug in the compiler or due to a bug in
@@ -1188,7 +1188,7 @@ bool StackColoring::removeAllMarkers() {
   return Count;
 }
 
-void StackColoring::remapInstructions(DenseMap<int, int>& SlotRemap, int MergedSlot) {
+void StackColoring::remapInstructions(DenseMap<int, int> &SlotRemap) {
   unsigned FixedInstr = 0;
   unsigned FixedMemOp = 0;
   unsigned FixedDbg = 0;
@@ -1198,9 +1198,6 @@ void StackColoring::remapInstructions(DenseMap<int, int>& SlotRemap, int MergedS
     if (!VI.Var || !VI.inStackSlot())
       continue;
     int Slot = VI.getStackSlot();
-    if (Slot >= 0 && Slot2Info[Slot].Offset != InvalidIdx) {
-      VI.updateStackSlot(MergedSlot);
-    }
     if (auto It = SlotRemap.find(Slot); It != SlotRemap.end()) {
       LLVM_DEBUG(dbgs() << "Remapping debug info for ["
                         << cast<DILocalVariable>(VI.Var)->getName() << "].\n");
@@ -1309,12 +1306,6 @@ void StackColoring::remapInstructions(DenseMap<int, int>& SlotRemap, int MergedS
         if (FromSlot<0)
           continue;
 
-        if (FromSlot >= 0 && Slot2Info[FromSlot].Offset != InvalidIdx) {
-          MO.setIndex(MergedSlot);
-          MO.setOffset(MO.getOffset() + Slot2Info[FromSlot].Offset);
-          continue;
-        }
-
         // Only look at mapped slots.
         if (!SlotRemap.count(FromSlot))
           continue;
@@ -1356,8 +1347,6 @@ void StackColoring::remapInstructions(DenseMap<int, int>& SlotRemap, int MergedS
           auto To = SlotRemap.find(FI);
           if (To != SlotRemap.end())
             SSRefs[FI].push_back(MMO);
-          if (FI >= 0 && Slot2Info[FI].Offset != InvalidIdx)
-            SSRefs[FI].push_back(MMO);
         }
 
         // If this memory location can be a slot remapped here,
@@ -1376,7 +1365,7 @@ void StackColoring::remapInstructions(DenseMap<int, int>& SlotRemap, int MergedS
                 // that is not remapped, we continue checking.
                 // Otherwise, we need to invalidate AA infomation.
                 const AllocaInst *AI = dyn_cast_or_null<AllocaInst>(V);
-                if ((AI && MergedAllocas.count(AI)) || UseNewStackColoring) {
+                if (AI && MergedAllocas.count(AI)) {
                   MayHaveConflictingAAMD = true;
                   break;
                 }
@@ -1400,20 +1389,13 @@ void StackColoring::remapInstructions(DenseMap<int, int>& SlotRemap, int MergedS
   // Rewrite MachineMemOperands that reference old frame indices.
   for (auto E : enumerate(SSRefs))
     if (!E.value().empty()) {
-      if (UseNewStackColoring) {
-        const PseudoSourceValue *NewSV =
-            MF->getPSVManager().getFixedStack(MergedSlot);
-        for (MachineMemOperand *Ref : E.value())
-          Ref->setValue(NewSV);
-      } else {
-        const PseudoSourceValue *NewSV = MF->getPSVManager().getFixedStack(
-            SlotRemap.find(E.index())->second);
-        for (MachineMemOperand *Ref : E.value())
-          Ref->setValue(NewSV);
-      }
+      const PseudoSourceValue *NewSV =
+          MF->getPSVManager().getFixedStack(SlotRemap.find(E.index())->second);
+      for (MachineMemOperand *Ref : E.value())
+        Ref->setValue(NewSV);
     }
 
-  // Update the location of C++ catch objects for the MSVC personality routine.
+    // Update the location of C++ catch objects for the MSVC personality routine.
   if (WinEHFuncInfo *EHInfo = MF->getWinEHFuncInfo())
     for (WinEHTryBlockMapEntry &TBME : EHInfo->TryBlockMap)
       for (WinEHHandlerType &H : TBME.HandlerArray)
@@ -1933,7 +1915,7 @@ bool StackColoring::run(MachineFunction &Func) {
     // indices to use the remapped frame index.
     if (!SlotRemap.empty()) {
       expungeSlotMap(SlotRemap, NumSlots);
-      remapInstructions(SlotRemap, InvalidIdx);
+      remapInstructions(SlotRemap);
     }
   } else {
     // Maybe this entire logic should be moved to a generic StackLayouter that

>From df2fb92fe3f6e79f0fd604001b88ddff5be563d8 Mon Sep 17 00:00:00 2001
From: tyker <tyker1 at outlook.com>
Date: Thu, 12 Jun 2025 00:27:36 +0200
Subject: [PATCH 18/19] format StackColoring.cpp

---
 llvm/lib/CodeGen/StackColoring.cpp | 126 +++++++++++++++--------------
 1 file changed, 64 insertions(+), 62 deletions(-)

diff --git a/llvm/lib/CodeGen/StackColoring.cpp b/llvm/lib/CodeGen/StackColoring.cpp
index 9fdf5c426201b..9319401424f3f 100644
--- a/llvm/lib/CodeGen/StackColoring.cpp
+++ b/llvm/lib/CodeGen/StackColoring.cpp
@@ -49,8 +49,8 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
 #include "llvm/Support/DebugCounter.h"
+#include "llvm/Support/raw_ostream.h"
 #include <algorithm>
 #include <cassert>
 #include <limits>
@@ -64,10 +64,9 @@ using namespace llvm;
 DEBUG_COUNTER(ProcessSlot, DEBUG_TYPE "-slot",
               "Controls which slot get processed");
 
-static cl::opt<bool>
-DisableColoring("no-stack-coloring",
-        cl::init(false), cl::Hidden,
-        cl::desc("Disable stack coloring"));
+static cl::opt<bool> DisableColoring("no-stack-coloring", cl::init(false),
+                                     cl::Hidden,
+                                     cl::desc("Disable stack coloring"));
 
 /// The user may write code that uses allocas outside of the declared lifetime
 /// zone. This can happen when the user returns a reference to a local
@@ -75,31 +74,31 @@ DisableColoring("no-stack-coloring",
 /// code. If this flag is enabled, we try to save the user. This option
 /// is treated as overriding LifetimeStartOnFirstUse below.
 static cl::opt<bool>
-ProtectFromEscapedAllocas("protect-from-escaped-allocas",
-                          cl::init(false), cl::Hidden,
-                          cl::desc("Do not optimize lifetime zones that "
-                                   "are broken"));
+    ProtectFromEscapedAllocas("protect-from-escaped-allocas", cl::init(false),
+                              cl::Hidden,
+                              cl::desc("Do not optimize lifetime zones that "
+                                       "are broken"));
 
 /// Enable enhanced dataflow scheme for lifetime analysis (treat first
 /// use of stack slot as start of slot lifetime, as opposed to looking
 /// for LIFETIME_START marker). See "Implementation notes" below for
 /// more info.
 static cl::opt<bool>
-LifetimeStartOnFirstUse("stackcoloring-lifetime-start-on-first-use",
-        cl::init(true), cl::Hidden,
-        cl::desc("Treat stack lifetimes as starting on first use, not on START marker."));
+    LifetimeStartOnFirstUse("stackcoloring-lifetime-start-on-first-use",
+                            cl::init(true), cl::Hidden,
+                            cl::desc("Treat stack lifetimes as starting on "
+                                     "first use, not on START marker."));
 
 static cl::opt<bool> UseNewStackColoring(
     "new-stack-coloring", cl::init(false), cl::Hidden,
     cl::desc("Use a better logic to try to reduce stack usage"));
 
 static cl::opt<unsigned> MaxCandidatesOpt(
-    "stackcoloring-max-candidates", cl::init(0),
-    cl::Hidden,
+    "stackcoloring-max-candidates", cl::init(0), cl::Hidden,
     cl::desc(
         "Max number of candidates that will be evaluated, 0 means no limit"));
 
-STATISTIC(NumMarkerSeen,  "Number of lifetime markers found.");
+STATISTIC(NumMarkerSeen, "Number of lifetime markers found.");
 STATISTIC(GeneratedWorse, "Number of times worse layout were generated");
 STATISTIC(StackSpaceSaved, "Number of bytes saved due to merging slots.");
 STATISTIC(StackSlotMerged, "Number of stack slot merged.");
@@ -392,7 +391,7 @@ class StackColoring {
   MachineFrameInfo *MFI = nullptr;
   MachineFunction *MF = nullptr;
 
-  LiveStacks* LS = nullptr;
+  LiveStacks *LS = nullptr;
 
   struct SlotInfo {
     // All places in the current function where this Slot is live
@@ -420,7 +419,7 @@ class StackColoring {
                     [&](int Idx) { return Liveness[Idx]; });
     }
 
-    LLVM_DUMP_METHOD void dump(const StackColoring* State = nullptr) const;
+    LLVM_DUMP_METHOD void dump(const StackColoring *State = nullptr) const;
   };
 
   /// A class representing liveness information for a single basic block.
@@ -465,7 +464,7 @@ class StackColoring {
 
   /// The list of lifetime markers found. These markers are to be removed
   /// once the coloring is done.
-  SmallVector<MachineInstr*, 8> Markers;
+  SmallVector<MachineInstr *, 8> Markers;
 
   /// Record the FI slots for which we have seen some sort of
   /// lifetime marker (either start or end).
@@ -479,7 +478,8 @@ class StackColoring {
   unsigned NumIterations;
 
 public:
-  StackColoring(SlotIndexes *Indexes, LiveStacks* LS) : LS(LS), Indexes(Indexes) {}
+  StackColoring(SlotIndexes *Indexes, LiveStacks *LS)
+      : LS(LS), Indexes(Indexes) {}
   bool run(MachineFunction &Func);
 
 private:
@@ -506,7 +506,8 @@ class StackColoring {
   unsigned doMerging(unsigned NumSlots);
 
   /// Returns TRUE if we're using the first-use-begins-lifetime method for
-  /// this slot (if FALSE, then the start marker is treated as start of lifetime).
+  /// this slot (if FALSE, then the start marker is treated as start of
+  /// lifetime).
   bool applyFirstUse(int Slot) {
     if (!LifetimeStartOnFirstUse || ProtectFromEscapedAllocas)
       return false;
@@ -520,8 +521,7 @@ class StackColoring {
   /// starting or ending are added to the vector "slots" and "isStart" is set
   /// accordingly.
   /// \returns True if inst contains a lifetime start or end
-  bool isLifetimeStartOrEnd(const MachineInstr &MI,
-                            SmallVector<int, 4> &slots,
+  bool isLifetimeStartOrEnd(const MachineInstr &MI, SmallVector<int, 4> &slots,
                             bool &isStart);
 
   /// Construct the LiveIntervals for the slots.
@@ -623,8 +623,8 @@ LLVM_DUMP_METHOD void StackColoring::dumpBB(MachineBasicBlock *MBB) const {
 
 LLVM_DUMP_METHOD void StackColoring::dump() const {
   for (MachineBasicBlock *MBB : depth_first(MF)) {
-    dbgs() << "Inspecting block #" << MBB->getNumber() << " ["
-           << MBB->getName() << "]\n";
+    dbgs() << "Inspecting block #" << MBB->getNumber() << " [" << MBB->getName()
+           << "]\n";
     dumpBB(MBB);
   }
 }
@@ -642,7 +642,8 @@ LLVM_DUMP_METHOD void StackColoring::dumpIntervals() const {
   }
 }
 
-LLVM_DUMP_METHOD void StackColoring::SlotInfo::dump(const StackColoring* State) const {
+LLVM_DUMP_METHOD void
+StackColoring::SlotInfo::dump(const StackColoring *State) const {
   unsigned Slot = InvalidIdx;
   if (State) {
     Slot = this - State->Slot2Info.data();
@@ -654,7 +655,8 @@ LLVM_DUMP_METHOD void StackColoring::SlotInfo::dump(const StackColoring* State)
     dbgs() << " offset=" << Offset;
   if (State) {
     if (State->MFI->getObjectAllocation(Slot))
-      dbgs() << " \"" << State->MFI->getObjectAllocation(Slot)->getName() << "\"";
+      dbgs() << " \"" << State->MFI->getObjectAllocation(Slot)->getName()
+             << "\"";
     if (State->MFI->isSpillSlotObjectIndex(Slot))
       dbgs() << " spill";
   }
@@ -673,8 +675,7 @@ LLVM_DUMP_METHOD void StackColoring::SlotInfo::dump(const StackColoring* State)
 
 #endif
 
-static inline int getStartOrEndSlot(const MachineInstr &MI)
-{
+static inline int getStartOrEndSlot(const MachineInstr &MI) {
   assert((MI.getOpcode() == TargetOpcode::LIFETIME_START ||
           MI.getOpcode() == TargetOpcode::LIFETIME_END) &&
          "Expected LIFETIME_START or LIFETIME_END op");
@@ -715,7 +716,7 @@ bool StackColoring::isLifetimeStartOrEnd(const MachineInstr &MI,
         if (!MO.isFI())
           continue;
         int Slot = MO.getIndex();
-        if (Slot<0)
+        if (Slot < 0)
           continue;
         if (InterestingSlots.test(Slot) && applyFirstUse(Slot)) {
           slots.push_back(Slot);
@@ -802,7 +803,7 @@ unsigned StackColoring::collectMarkers(unsigned NumSlot) {
           int Slot = MO.getIndex();
           if (Slot < 0)
             continue;
-          if (! BetweenStartEnd.test(Slot)) {
+          if (!BetweenStartEnd.test(Slot)) {
             ConservativeSlots.set(Slot);
           }
         }
@@ -954,7 +955,7 @@ void StackColoring::calculateLiveIntervals(unsigned NumSlots) {
 
   DefinitelyInUse.resize(NumSlots);
   struct SplitSlotChanges {
-    const MachineInstr* AtMI;
+    const MachineInstr *AtMI;
     unsigned BlockIdx : 31;
     unsigned IsStart : 1;
     unsigned Slot;
@@ -995,7 +996,7 @@ void StackColoring::calculateLiveIntervals(unsigned NumSlots) {
       BitVector IsStoredTo;
       IsStoredTo.resize(NumSlots, false);
       struct MIBlockIdx {
-        const MachineInstr* MI;
+        const MachineInstr *MI;
         unsigned BlockIdx;
       };
       unsigned BlockIdx = 0;
@@ -1004,7 +1005,7 @@ void StackColoring::calculateLiveIntervals(unsigned NumSlots) {
       for (const MachineInstr &MI : MBB) {
         if (MI.isDebugInstr())
           continue;
-        for (MachineMemOperand* MMO : MI.memoperands()) {
+        for (MachineMemOperand *MMO : MI.memoperands()) {
           auto *PSV = dyn_cast_if_present<FixedStackPseudoSourceValue>(
               MMO->getPseudoValue());
           if (!PSV)
@@ -1207,10 +1208,10 @@ void StackColoring::remapInstructions(DenseMap<int, int> &SlotRemap) {
   }
 
   // Keep a list of *allocas* which need to be remapped.
-  DenseMap<const AllocaInst*, const AllocaInst*> Allocas;
+  DenseMap<const AllocaInst *, const AllocaInst *> Allocas;
 
   // Keep a list of allocas which has been affected by the remap.
-  SmallPtrSet<const AllocaInst*, 32> MergedAllocas;
+  SmallPtrSet<const AllocaInst *, 32> MergedAllocas;
 
   for (const std::pair<int, int> &SI : SlotRemap) {
     const AllocaInst *From = MFI->getObjectAllocation(SI.first);
@@ -1244,8 +1245,8 @@ void StackColoring::remapInstructions(DenseMap<int, int> &SlotRemap) {
     // Transfer the stack protector layout tag, but make sure that SSPLK_AddrOf
     // does not overwrite SSPLK_SmallArray or SSPLK_LargeArray, and make sure
     // that SSPLK_SmallArray does not overwrite SSPLK_LargeArray.
-    MachineFrameInfo::SSPLayoutKind FromKind
-        = MFI->getObjectSSPLayout(SI.first);
+    MachineFrameInfo::SSPLayoutKind FromKind =
+        MFI->getObjectSSPLayout(SI.first);
     MachineFrameInfo::SSPLayoutKind ToKind = MFI->getObjectSSPLayout(SI.second);
     if (FromKind != MachineFrameInfo::SSPLK_None &&
         (ToKind == MachineFrameInfo::SSPLK_None ||
@@ -1303,20 +1304,20 @@ void StackColoring::remapInstructions(DenseMap<int, int> &SlotRemap) {
         int FromSlot = MO.getIndex();
 
         // Don't touch arguments.
-        if (FromSlot<0)
+        if (FromSlot < 0)
           continue;
 
         // Only look at mapped slots.
         if (!SlotRemap.count(FromSlot))
           continue;
 
-        // In a debug build, check that the instruction that we are modifying is
-        // inside the expected live range. If the instruction is not inside
-        // the calculated range then it means that the alloca usage moved
-        // outside of the lifetime markers, or that the user has a bug.
-        // NOTE: Alloca address calculations which happen outside the lifetime
-        // zone are okay, despite the fact that we don't have a good way
-        // for validating all of the usages of the calculation.
+      // In a debug build, check that the instruction that we are modifying is
+      // inside the expected live range. If the instruction is not inside
+      // the calculated range then it means that the alloca usage moved
+      // outside of the lifetime markers, or that the user has a bug.
+      // NOTE: Alloca address calculations which happen outside the lifetime
+      // zone are okay, despite the fact that we don't have a good way
+      // for validating all of the usages of the calculation.
 #ifndef NDEBUG
         bool TouchesMemory = I.mayLoadOrStore();
         // If we *don't* protect the user from escaped allocas, don't bother
@@ -1395,7 +1396,7 @@ void StackColoring::remapInstructions(DenseMap<int, int> &SlotRemap) {
         Ref->setValue(NewSV);
     }
 
-    // Update the location of C++ catch objects for the MSVC personality routine.
+  // Update the location of C++ catch objects for the MSVC personality routine.
   if (WinEHFuncInfo *EHInfo = MF->getWinEHFuncInfo())
     for (WinEHTryBlockMapEntry &TBME : EHInfo->TryBlockMap)
       for (WinEHHandlerType &H : TBME.HandlerArray)
@@ -1407,9 +1408,9 @@ void StackColoring::remapInstructions(DenseMap<int, int> &SlotRemap) {
   LLVM_DEBUG(dbgs() << "Fixed " << FixedMemOp << " machine memory operands.\n");
   LLVM_DEBUG(dbgs() << "Fixed " << FixedDbg << " debug locations.\n");
   LLVM_DEBUG(dbgs() << "Fixed " << FixedInstr << " machine instructions.\n");
-  (void) FixedMemOp;
-  (void) FixedDbg;
-  (void) FixedInstr;
+  (void)FixedMemOp;
+  (void)FixedDbg;
+  (void)FixedInstr;
 }
 
 void StackColoring::removeInvalidSlotRanges() {
@@ -1435,7 +1436,7 @@ void StackColoring::removeInvalidSlotRanges() {
 
         int Slot = MO.getIndex();
 
-        if (Slot<0)
+        if (Slot < 0)
           continue;
 
         if (Intervals[Slot]->empty())
@@ -1457,7 +1458,7 @@ void StackColoring::removeInvalidSlotRanges() {
 void StackColoring::expungeSlotMap(DenseMap<int, int> &SlotRemap,
                                    unsigned NumSlots) {
   // Expunge slot remap map.
-  for (unsigned i=0; i < NumSlots; ++i) {
+  for (unsigned i = 0; i < NumSlots; ++i) {
     // If we are remapping i
     if (auto It = SlotRemap.find(i); It != SlotRemap.end()) {
       int Target = It->second;
@@ -1477,8 +1478,9 @@ bool StackColoringLegacy::runOnMachineFunction(MachineFunction &MF) {
   if (skipFunction(MF.getFunction()))
     return false;
 
-  LiveStacks* LS = nullptr;
-  LiveStacksWrapperLegacy* LSWL = getAnalysisIfAvailable<LiveStacksWrapperLegacy>();
+  LiveStacks *LS = nullptr;
+  LiveStacksWrapperLegacy *LSWL =
+      getAnalysisIfAvailable<LiveStacksWrapperLegacy>();
   if (LSWL)
     LS = &LSWL->getLS();
 
@@ -1502,7 +1504,7 @@ unsigned StackColoring::doMerging(unsigned NumSlots) {
   int64_t OrigOptSize = 0;
   int64_t OrigPesSize = 0;
   for (unsigned Slot = 0; Slot < NumSlots; Slot++) {
-    SlotInfo& Info = Slot2Info[Slot];
+    SlotInfo &Info = Slot2Info[Slot];
     if (Info.StartLiveness.empty())
       assert(!LS || !LS->hasInterval(Slot));
     if (!Info.StartLiveness.empty() &&
@@ -1553,8 +1555,8 @@ unsigned StackColoring::doMerging(unsigned NumSlots) {
         });
   }
 
-  SlotInfo* LastQueryLhs = nullptr;
-  SlotInfo* LastQueryRhs = nullptr;
+  SlotInfo *LastQueryLhs = nullptr;
+  SlotInfo *LastQueryRhs = nullptr;
   bool LastQueryRes = false;
   // Maybe there should be real caching here
   auto HasOverlapCached = [&](SlotInfo &Lhs, SlotInfo &Rhs) {
@@ -1582,7 +1584,7 @@ unsigned StackColoring::doMerging(unsigned NumSlots) {
   LatestStatus.resize(LivenessSize, Status{});
   SmallVector<Status> OlderStatus;
 
-  auto FindStatus = [&](SlotInfo &Info, unsigned Pt) -> Status& {
+  auto FindStatus = [&](SlotInfo &Info, unsigned Pt) -> Status & {
     Status *Last = &LatestStatus[Pt];
 
     // The slots in the linked-list are always kept in ascending order, so the
@@ -1597,7 +1599,7 @@ unsigned StackColoring::doMerging(unsigned NumSlots) {
     return *Last;
   };
   auto UpdateStatus = [&](SlotInfo &Info, unsigned Pt, unsigned Offset) {
-    Status* Last = &LatestStatus[Pt];
+    Status *Last = &LatestStatus[Pt];
     unsigned Idx = OlderStatus.size();
     OlderStatus.push_back(*Last);
 
@@ -1612,7 +1614,7 @@ unsigned StackColoring::doMerging(unsigned NumSlots) {
     }
 
     // Ensure ordering of slots
-    Status* Inserted = &OlderStatus.back();
+    Status *Inserted = &OlderStatus.back();
     Inserted->Offset = Offset;
     Inserted->Slot = &Info - Slot2Info.data();
     Status *Curr = Last;
@@ -1719,7 +1721,7 @@ unsigned StackColoring::doMerging(unsigned NumSlots) {
   }
 
   unsigned FinalSize = 0;
-  for (Status& U : LatestStatus)
+  for (Status &U : LatestStatus)
     FinalSize = std::max(FinalSize, U.Offset);
   LLVM_DEBUG(dbgs() << "MergedSize=" << FinalSize << " OrigPesSize="
                     << OrigPesSize << " OrigOptSize" << OrigOptSize << "\n");
@@ -1778,7 +1780,7 @@ bool StackColoring::run(MachineFunction &Func) {
   LLVM_DEBUG(dbgs() << "Found " << NumMarkers << " markers and " << NumSlots
                     << " slots\n");
 
-  for (int i=0; i < MFI->getObjectIndexEnd(); ++i)
+  for (int i = 0; i < MFI->getObjectIndexEnd(); ++i)
     TotalSize += MFI->getObjectSize(i);
 
   LLVM_DEBUG(dbgs() << "Total Stack size: " << TotalSize << " bytes\n\n");
@@ -1791,7 +1793,7 @@ bool StackColoring::run(MachineFunction &Func) {
   }
 
   Slot2Info.resize(NumSlots);
-  for (unsigned i=0; i < NumSlots; ++i) {
+  for (unsigned i = 0; i < NumSlots; ++i) {
     std::unique_ptr<LiveRange> LI(new LiveRange());
     LI->getNextValue(Indexes->getZeroIndex(), VNInfoAllocator);
     Intervals.push_back(std::move(LI));

>From f7ae304acbe9adcfe84115b65d5992f8b52c0cab Mon Sep 17 00:00:00 2001
From: tyker <tyker1 at outlook.com>
Date: Thu, 19 Jun 2025 20:06:44 +0200
Subject: [PATCH 19/19] Update selection heuristics to avoid code-size
 regression on average

---
 llvm/include/llvm/CodeGen/MachineFrameInfo.h |  2 +-
 llvm/lib/CodeGen/StackColoring.cpp           | 84 ++++++++++----------
 2 files changed, 41 insertions(+), 45 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/MachineFrameInfo.h b/llvm/include/llvm/CodeGen/MachineFrameInfo.h
index fdb2fbd133397..5c05b792cd1e0 100644
--- a/llvm/include/llvm/CodeGen/MachineFrameInfo.h
+++ b/llvm/include/llvm/CodeGen/MachineFrameInfo.h
@@ -772,7 +772,7 @@ class MachineFrameInfo {
     // If ID == 0, MaxAlignment will need to be updated separately.
   }
 
-  int getUnderlyingSlot(int ObjectIdx) {
+  int getUnderlyingSlot(int ObjectIdx) const {
     assert(unsigned(ObjectIdx + NumFixedObjects) < Objects.size() &&
            "Invalid Object Idx!");
     return Objects[ObjectIdx + NumFixedObjects].UnderlyingSlot;
diff --git a/llvm/lib/CodeGen/StackColoring.cpp b/llvm/lib/CodeGen/StackColoring.cpp
index 9319401424f3f..798eef9354256 100644
--- a/llvm/lib/CodeGen/StackColoring.cpp
+++ b/llvm/lib/CodeGen/StackColoring.cpp
@@ -35,6 +35,7 @@
 #include "llvm/CodeGen/PseudoSourceValueManager.h"
 #include "llvm/CodeGen/SlotIndexes.h"
 #include "llvm/CodeGen/TargetOpcodes.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
 #include "llvm/CodeGen/WinEHFuncInfo.h"
 #include "llvm/Config/llvm-config.h"
 #include "llvm/IR/Constants.h"
@@ -99,7 +100,6 @@ static cl::opt<unsigned> MaxCandidatesOpt(
         "Max number of candidates that will be evaluated, 0 means no limit"));
 
 STATISTIC(NumMarkerSeen, "Number of lifetime markers found.");
-STATISTIC(GeneratedWorse, "Number of times worse layout were generated");
 STATISTIC(StackSpaceSaved, "Number of bytes saved due to merging slots.");
 STATISTIC(StackSlotMerged, "Number of stack slot merged.");
 STATISTIC(EscapedAllocas, "Number of allocas that escaped the lifetime region");
@@ -400,7 +400,9 @@ class StackColoring {
     // Use to make overlap queries faster
     SmallVector<unsigned, 4> StartLiveness;
 
-    uint64_t SlotPriority = 0;
+    int64_t SlotPriority = 0;
+
+    unsigned UseCount = 0;
 
     unsigned Offset = InvalidIdx;
 
@@ -653,9 +655,11 @@ StackColoring::SlotInfo::dump(const StackColoring *State) const {
   dbgs() << ":";
   if (Offset != InvalidIdx)
     dbgs() << " offset=" << Offset;
+  dbgs() << " uses=" << UseCount;
+  dbgs() << " prio=" << SlotPriority;
   if (State) {
     if (State->MFI->getObjectAllocation(Slot))
-      dbgs() << " \"" << State->MFI->getObjectAllocation(Slot)->getName()
+      dbgs() << " alloca=\"" << State->MFI->getObjectAllocation(Slot)->getName()
              << "\"";
     if (State->MFI->isSpillSlotObjectIndex(Slot))
       dbgs() << " spill";
@@ -803,6 +807,7 @@ unsigned StackColoring::collectMarkers(unsigned NumSlot) {
           int Slot = MO.getIndex();
           if (Slot < 0)
             continue;
+          Slot2Info[Slot].UseCount++;
           if (!BetweenStartEnd.test(Slot)) {
             ConservativeSlots.set(Slot);
           }
@@ -1525,35 +1530,24 @@ unsigned StackColoring::doMerging(unsigned NumSlots) {
   if (SlotStack.size() <= 1)
     return InvalidIdx;
 
-  // This Whole block is only used to try and order the stack, such that the
-  // Slots are processed in an order that helps getting good packing
-  {
-    // Find how much usage of every livepoint there is.
-    SmallVector<unsigned> CumulatedUsage;
-    CumulatedUsage.resize(LivenessSize, 0);
-
-    for (unsigned Idx = 0; Idx < SlotStack.size(); Idx++) {
-      SlotInfo &Info = Slot2Info[SlotStack[Idx]];
-      for (unsigned Pt : Info.Liveness.set_bits()) {
-        CumulatedUsage[Pt] += Info.Size;
-      }
-    }
+  // This logic is tuned for x86_64; it probably needs to be adapted for
+  // other targets to get a good code-size/stack-size balance.
+  // It is inspired by X86FrameLowering::orderFrameObjects, but also weighs
+  // in the alignment, which helps with stack size.
+  auto IsLower = [&](unsigned Lhs, unsigned Rhs) {
+    SlotInfo &L = Slot2Info[Lhs];
+    SlotInfo &R = Slot2Info[Rhs];
+    uint64_t DensityLScaled = static_cast<uint64_t>(L.UseCount) *
+                              static_cast<uint64_t>(R.Size + Log2(R.Align));
+    uint64_t DensityRScaled = static_cast<uint64_t>(R.UseCount) *
+                              static_cast<uint64_t>(L.Size + Log2(L.Align));
+    return DensityLScaled < DensityRScaled;
+  };
+  std::stable_sort(SlotStack.begin(), SlotStack.end(), IsLower);
 
-    for (unsigned Idx = 0; Idx < SlotStack.size(); Idx++) {
-      SlotInfo &Info = Slot2Info[SlotStack[Idx]];
-      for (unsigned Pt : Info.Liveness.set_bits()) {
-        // Since the goal is to minimize the max usage, blocks that are in high
-        // contention areas are given more priority
-        Info.SlotPriority +=
-            (uint64_t)CumulatedUsage[Pt] * (uint64_t)CumulatedUsage[Pt] +
-            (uint64_t)Info.Size * (uint64_t)Info.Align.value();
-      }
-    }
-    std::stable_sort(
-        SlotStack.begin(), SlotStack.end(), [&](unsigned Lhs, unsigned Rhs) {
-          return Slot2Info[Lhs].SlotPriority < Slot2Info[Rhs].SlotPriority;
-        });
-  }
+  int Prio = 0;
+  for (int Slot : SlotStack)
+    Slot2Info[Slot].SlotPriority = Prio++;
 
   SlotInfo *LastQueryLhs = nullptr;
   SlotInfo *LastQueryRhs = nullptr;
@@ -1666,24 +1660,27 @@ unsigned StackColoring::doMerging(unsigned NumSlots) {
 
       Offset = alignTo(Offset, Info.Align);
 
-      LLVM_DEBUG(dbgs() << "fi#" << Candidates[K] << "@" << Offset << "->";
-                 if (PrevSlot == InvalidIdx) dbgs() << "bottom";
-                 else dbgs() << "fi#" << PrevSlot; dbgs() << ", ";);
+      LLVM_DEBUG({
+        dbgs() << "fi#" << Candidates[K] << "@" << Offset;
+        if (PrevSlot != InvalidIdx)
+          dbgs() << "->" << "fi#" << PrevSlot;
+        dbgs() << ", ";
+      });
 
       bool IsBetter = [&] {
+        if (BestIdx == InvalidIdx)
+          return true;
+        SlotInfo &Best = Slot2Info[Candidates[BestIdx]];
         if (BestOffset != Offset)
           return BestOffset > Offset;
-        SlotInfo &Other = Slot2Info[Candidates[K]];
-        if (Other.Align != Info.Align)
-          return Other.Align < Info.Align;
-        if (Other.Size != Info.Size)
-          return Other.Size < Info.Size;
-        if (Other.SlotPriority != Info.SlotPriority)
-          return Other.SlotPriority < Info.SlotPriority;
+        if (Best.SlotPriority != Info.SlotPriority)
+          return Best.SlotPriority < Info.SlotPriority;
+        if (Best.Align != Info.Align)
+          return Best.Align < Info.Align;
 
         // Both are always stored in Slot2Info, so this is equivalent to
         // FrameIndex comparaison
-        return &Other < &Info;
+        return &Best < &Info;
       }();
 
       if (IsBetter) {
@@ -1726,7 +1723,6 @@ unsigned StackColoring::doMerging(unsigned NumSlots) {
   LLVM_DEBUG(dbgs() << "MergedSize=" << FinalSize << " OrigPesSize="
                     << OrigPesSize << " OrigOptSize" << OrigOptSize << "\n");
   if (FinalSize >= OrigPesSize) {
-    GeneratedWorse++;
     return InvalidIdx;
   }
 
@@ -1774,6 +1770,7 @@ bool StackColoring::run(MachineFunction &Func) {
   Intervals.reserve(NumSlots);
   LiveStarts.resize(NumSlots);
 
+  Slot2Info.resize(NumSlots);
   unsigned NumMarkers = collectMarkers(NumSlots);
 
   unsigned TotalSize = 0;
@@ -1792,7 +1789,6 @@ bool StackColoring::run(MachineFunction &Func) {
     return removeAllMarkers();
   }
 
-  Slot2Info.resize(NumSlots);
   for (unsigned i = 0; i < NumSlots; ++i) {
     std::unique_ptr<LiveRange> LI(new LiveRange());
     LI->getNextValue(Indexes->getZeroIndex(), VNInfoAllocator);



More information about the llvm-commits mailing list