[llvm] [StackSlotColoring] Add size aware stack slot coloring with TTI hook (PR #181874)

via llvm-commits llvm-commits at lists.llvm.org
Wed Feb 18 07:16:48 PST 2026


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-amdgpu

Author: Akash Dutta (akadutta)

<details>
<summary>Changes</summary>

Stack slot coloring is extended with an optional, target-controlled size-aware mode: ordering can take slot size into account (via a TTI scale), and allocation can use best-fit instead of first-fit. 

**Changes**
**TTI**: New hook getStackSlotColoringSizeWeightScale() (default 0.0f). When the scale is positive, the pass uses that value for size-aware ordering and enables best-fit allocation; when 0, behavior is unchanged (weight-only ordering, first-fit). For AMDGPU the hook is overridden to return 0.01, enabling size-aware ordering and best-fit for this target only.
**Ordering**: When scale > 0, intervals are sorted by weight + scale * slot_size (heaviest/largest first) instead of weight alone. LiveInterval weights are not modified; the combination is used only as the sort key so merged weights remain use-based.
**Allocation**: When scale > 0, the pass picks the existing slot that minimizes max(existing_size, required_size) (best-fit) instead of the first non-overlapping slot (first-fit).


Potential benefits
* Lower stack/scratch usage on targets that enable the scale (e.g. AMDGPU), by packing slots more effectively and reusing space instead of extending the first fit. For example, in the new test (test/CodeGen/AMDGPU/stack-slot-coloring-size-aware.ll) this reduces scratch memory usage from 280 to 256.
* Better packing when weights are similar by processing larger slots earlier so best-fit can reuse space more often.



Co-authored with Cursor AI

---

Patch is 579.49 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/181874.diff


9 Files Affected:

- (modified) llvm/include/llvm/Analysis/TargetTransformInfo.h (+5) 
- (modified) llvm/include/llvm/Analysis/TargetTransformInfoImpl.h (+2) 
- (modified) llvm/lib/Analysis/TargetTransformInfo.cpp (+4) 
- (modified) llvm/lib/CodeGen/StackSlotColoring.cpp (+58-11) 
- (modified) llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h (+2) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll (+1742-1743) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll (+64-64) 
- (modified) llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll (+136-136) 
- (added) llvm/test/CodeGen/AMDGPU/stack-slot-coloring-size-aware.ll (+515) 


``````````diff
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index b06ee091827f7..7db4d6584d724 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -873,6 +873,11 @@ class TargetTransformInfo {
                            LoopInfo *LI, DominatorTree *DT, AssumptionCache *AC,
                            TargetLibraryInfo *LibInfo) const;
 
+  /// Return a positive scale to incorporate stack slot size into coloring
+  /// weight (weight += scale * size). When 0, only use-based weight is used;
+  /// when positive, the target also uses best-fit allocation.
+  LLVM_ABI float getStackSlotColoringSizeWeightScale() const;
+
   /// Which addressing mode Loop Strength Reduction will try to generate.
   enum AddressingModeKind {
     AMK_None = 0x0,        ///< Don't prefer any addressing mode
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 5ef18fecabd99..bd9f0a58bb3dd 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -345,6 +345,8 @@ class TargetTransformInfoImplBase {
     return false;
   }
 
+  virtual float getStackSlotColoringSizeWeightScale() const { return 0.0f; }
+
   virtual TTI::AddressingModeKind
   getPreferredAddressingMode(const Loop *L, ScalarEvolution *SE) const {
     return TTI::AMK_None;
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 504fa9b448ec0..9fbecf2b62b06 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -470,6 +470,10 @@ bool TargetTransformInfo::canSaveCmp(Loop *L, BranchInst **BI,
   return TTIImpl->canSaveCmp(L, BI, SE, LI, DT, AC, LibInfo);
 }
 
+float TargetTransformInfo::getStackSlotColoringSizeWeightScale() const {
+  return TTIImpl->getStackSlotColoringSizeWeightScale();
+}
+
 TTI::AddressingModeKind
 TargetTransformInfo::getPreferredAddressingMode(const Loop *L,
                                                 ScalarEvolution *SE) const {
diff --git a/llvm/lib/CodeGen/StackSlotColoring.cpp b/llvm/lib/CodeGen/StackSlotColoring.cpp
index dfe613fb42ad6..9eff0b0430dd5 100644
--- a/llvm/lib/CodeGen/StackSlotColoring.cpp
+++ b/llvm/lib/CodeGen/StackSlotColoring.cpp
@@ -14,6 +14,7 @@
 #include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/CodeGen/LiveDebugVariables.h"
 #include "llvm/CodeGen/LiveInterval.h"
 #include "llvm/CodeGen/LiveIntervalUnion.h"
@@ -40,9 +41,11 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
 #include <cassert>
 #include <cstdint>
 #include <iterator>
+#include <limits>
 #include <vector>
 
 using namespace llvm;
@@ -140,6 +143,9 @@ class StackSlotColoring {
   // Assignments - Color to intervals mapping.
   SmallVector<ColorAssignmentInfo, 16> Assignments;
 
+  // Scale to add (scale * slot size) to weight; when > 0, also use best-fit.
+  float SizeWeightScale;
+
 public:
   StackSlotColoring(MachineFunction &MF, LiveStacks *LS,
                     MachineBlockFrequencyInfo *MBFI, SlotIndexes *Indexes)
@@ -281,7 +287,7 @@ void StackSlotColoring::InitializeSlots() {
 
     SSIntervals.push_back(&li);
     OrigAlignments[FI] = MFI->getObjectAlign(FI);
-    OrigSizes[FI]      = MFI->getObjectSize(FI);
+    OrigSizes[FI] = MFI->getObjectSize(FI);
 
     auto StackID = MFI->getStackID(FI);
     if (StackID != 0) {
@@ -297,8 +303,16 @@ void StackSlotColoring::InitializeSlots() {
   }
   LLVM_DEBUG(dbgs() << '\n');
 
-  // Sort them by weight.
-  llvm::stable_sort(SSIntervals, IntervalSorter());
+  // Sort by weight (heaviest first). When TTI provides a scale, use weight +
+  // scale*size as the sort key so larger slots are ordered earlier when weights
+  // are similar.
+  llvm::stable_sort(SSIntervals, [this](LiveInterval *LHS, LiveInterval *RHS) {
+    float L = LHS->weight() +
+              SizeWeightScale * OrigSizes[LHS->reg().stackSlotIndex()];
+    float R = RHS->weight() +
+              SizeWeightScale * OrigSizes[RHS->reg().stackSlotIndex()];
+    return L > R;
+  });
 
   NextColors.resize(AllColors.size());
 
@@ -315,16 +329,45 @@ int StackSlotColoring::ColorSlot(LiveInterval *li) {
   uint8_t StackID = MFI->getStackID(FI);
 
   if (!DisableSharing) {
+    int64_t RequiredSize = OrigSizes[FI];
+
+    if (SizeWeightScale > 0.0f) {
+      // Best-fit: choose the color that minimizes final size after sharing.
+      int BestColor = -1;
+      int64_t BestFinalSize = std::numeric_limits<int64_t>::max();
+
+      Color = UsedColors[StackID].find_first();
+      while (Color != -1) {
+        if (!Assignments[Color].overlaps(li)) {
+          int64_t ColorSize = MFI->getObjectSize(Color);
+          int64_t FinalSize = std::max(ColorSize, RequiredSize);
+          if (FinalSize < BestFinalSize) {
+            BestColor = Color;
+            BestFinalSize = FinalSize;
+          }
+        }
+        Color = UsedColors[StackID].find_next(Color);
+      }
 
-    // Check if it's possible to reuse any of the used colors.
-    Color = UsedColors[StackID].find_first();
-    while (Color != -1) {
-      if (!Assignments[Color].overlaps(li)) {
+      if (BestColor != -1) {
+        Color = BestColor;
         Share = true;
         ++NumEliminated;
-        break;
+        LLVM_DEBUG(dbgs() << "  Best-fit: fi#" << FI
+                          << " (size=" << RequiredSize << ") -> fi#" << Color
+                          << " (final=" << BestFinalSize << ")\n");
+      }
+    } else {
+      // First-fit: use the first non-overlapping color.
+      Color = UsedColors[StackID].find_first();
+      while (Color != -1) {
+        if (!Assignments[Color].overlaps(li)) {
+          Share = true;
+          ++NumEliminated;
+          break;
+        }
+        Color = UsedColors[StackID].find_next(Color);
       }
-      Color = UsedColors[StackID].find_next(Color);
     }
   }
 
@@ -333,8 +376,8 @@ int StackSlotColoring::ColorSlot(LiveInterval *li) {
     Share = false;
   }
 
-  // Assign it to the first available color (assumed to be the best) if it's
-  // not possible to share a used color with other objects.
+  // Assign it to a new color if it's not possible to share a used color
+  // with other objects.
   if (!Share) {
     assert(NextColors[StackID] != -1 && "No more spill slots?");
     Color = NextColors[StackID];
@@ -541,6 +584,10 @@ bool StackSlotColoring::run(MachineFunction &MF) {
   if (MF.exposesReturnsTwice())
     return false;
 
+  SizeWeightScale = MF.getTarget()
+                        .getTargetTransformInfo(MF.getFunction())
+                        .getStackSlotColoringSizeWeightScale();
+
   // Gather spill slot references
   ScanForSpillSlotRefs(MF);
   InitializeSlots();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index 3ec157aacd0aa..b8406b8195a73 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -114,6 +114,8 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
 
   bool hasBranchDivergence(const Function *F = nullptr) const override;
 
+  float getStackSlotColoringSizeWeightScale() const override { return 0.01f; }
+
   void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                TTI::UnrollingPreferences &UP,
                                OptimizationRemarkEmitter *ORE) const override;
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
index 1b1f7fcadc540..ae13a4cefed7a 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
@@ -25858,18 +25858,18 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT:    v_mul_f32_e64 v12, 1.0, s11
 ; SI-NEXT:    v_mul_f32_e64 v36, 1.0, s9
 ; SI-NEXT:    v_mul_f32_e64 v14, 1.0, s7
-; SI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v27, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v52, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v27, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v52, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
 ; SI-NEXT:    s_cbranch_scc0 .LBB19_2
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
@@ -25888,13 +25888,13 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT:    v_lshr_b64 v[3:4], v[56:57], 16
 ; SI-NEXT:    v_mov_b32_e32 v4, v41
 ; SI-NEXT:    v_lshrrev_b32_e32 v42, 16, v25
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshr_b64 v[4:5], v[41:42], 16
 ; SI-NEXT:    v_mov_b32_e32 v5, v6
-; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
 ; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v45
 ; SI-NEXT:    v_lshrrev_b32_e32 v41, 16, v52
 ; SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v43
@@ -25916,57 +25916,57 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshr_b64 v[5:6], v[6:7], 16
 ; SI-NEXT:    v_mov_b32_e32 v6, v40
-; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshr_b64 v[6:7], v[40:41], 16
 ; SI-NEXT:    v_mov_b32_e32 v7, v8
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshr_b64 v[7:8], v[8:9], 16
 ; SI-NEXT:    v_mov_b32_e32 v8, v46
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshr_b64 v[8:9], v[46:47], 16
 ; SI-NEXT:    v_mov_b32_e32 v9, v10
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshr_b64 v[9:10], v[10:11], 16
 ; SI-NEXT:    v_mov_b32_e32 v10, v44
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshr_b64 v[10:11], v[44:45], 16
 ; SI-NEXT:    v_mov_b32_e32 v11, v12
-; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
 ; SI-NEXT:    v_mov_b32_e32 v45, v50
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshr_b64 v[11:12], v[12:13], 16
 ; SI-NEXT:    v_mov_b32_e32 v12, v36
-; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshr_b64 v[12:13], v[36:37], 16
 ; SI-NEXT:    v_mov_b32_e32 v13, v14
-; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
 ; SI-NEXT:    v_lshrrev_b32_e32 v36, 16, v39
 ; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshr_b64 v[13:14], v[14:15], 16
 ; SI-NEXT:    v_mov_b32_e32 v14, v35
-; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshr_b64 v[14:15], v[35:36], 16
 ; SI-NEXT:    v_mov_b32_e32 v15, v16
-; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshr_b64 v[15:16], v[16:17], 16
 ; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
@@ -25982,8 +25982,8 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v17
 ; SI-NEXT:    v_mov_b32_e32 v17, v18
-; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshr_b64 v[17:18], v[18:19], 16
 ; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
@@ -25996,20 +25996,20 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshrrev_b32_e32 v21, 16, v19
 ; SI-NEXT:    v_mov_b32_e32 v19, v20
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_lshrrev_b32_e32 v25, 16, v25
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshr_b64 v[19:20], v[20:21], 16
 ; SI-NEXT:    v_mov_b32_e32 v20, v51
-; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshr_b64 v[20:21], v[51:52], 16
 ; SI-NEXT:    v_mov_b32_e32 v21, v22
-; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshr_b64 v[21:22], v[22:23], 16
 ; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
@@ -26076,34 +26076,34 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT:    v_lshr_b64 v[31:32], v[53:54], 16
 ; SI-NEXT:    s_branch .LBB19_3
 ; SI-NEXT:  .LBB19_2:
-; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/181874


More information about the llvm-commits mailing list