[llvm] [AMDGPU] Prioritize allocation of low 256 VGPR classes (PR #167978)

Wed Nov 19 10:27:14 PST 2025

https://github.com/rampitec updated https://github.com/llvm/llvm-project/pull/167978

>From 71dee8b6e0802b1d98ff56dbb95a4335caecf423 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
Date: Thu, 13 Nov 2025 15:24:52 -0800
Subject: [PATCH] [AMDGPU] Prioritize allocation of low 256 VGPR classes

If we have 1024 VGPRs available, we need to give allocation priority to
the register classes whose operands can only use the low 256 registers,
notably the scale operands of V_WMMA_SCALE instructions.
Otherwise large tuples will be allocated first and take all the low
registers, so we would have to spill to make room for these
scale registers.

Allocation priority by itself does not eliminate spilling completely
in large kernels, although it helps to some degree. Increasing the spill
weight of a restricted class on top of it helps further.
---
 llvm/lib/Target/AMDGPU/SIRegisterInfo.h               | 11 +++++++++++
 llvm/lib/Target/AMDGPU/SIRegisterInfo.td              |  2 +-
 llvm/test/CodeGen/AMDGPU/regalloc-spill-wmma-scale.ll |  5 ++---
 3 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index 1402291539ff8..bb8a80f811d4c 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -496,6 +496,17 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
 
   SmallVector<StringLiteral>
   getVRegFlagsOfReg(Register Reg, const MachineFunction &MF) const override;
+
+  float
+  getSpillWeightScaleFactor(const TargetRegisterClass *RC) const override {
+    // Double the base spill weight scale factor of the Lo256-restricted classes
+    // so they are favored over classes which may occupy registers beyond v255.
+    return AMDGPUGenRegisterInfo::getSpillWeightScaleFactor(RC) *
+           ((RC == &AMDGPU::VGPR_32_Lo256RegClass ||
+             RC == &AMDGPU::VReg_64_Lo256_Align2RegClass)
+                ? 2.0
+                : 1.0);
+  }
 };
 
 namespace AMDGPU {
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index abe12c17ae76c..5cff5f2248b02 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -644,7 +644,7 @@ def VGPR_32_Lo128 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg1
 // Identical to VGPR_32 except it only contains the low 256 (Lo256) registers.
 def VGPR_32_Lo256 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg16Types.types), 32,
                                     (add (sequence "VGPR%u", 0, 255))> {
-  let AllocationPriority = 0;
+  let AllocationPriority = !add(3, !mul(BaseClassPriority, BaseClassScaleFactor));
   let GeneratePressureSet = 0;
   let Size = 32;
   let Weight = 1;
diff --git a/llvm/test/CodeGen/AMDGPU/regalloc-spill-wmma-scale.ll b/llvm/test/CodeGen/AMDGPU/regalloc-spill-wmma-scale.ll
index 1ac3da3b930f9..eafe54ebc98f8 100644
--- a/llvm/test/CodeGen/AMDGPU/regalloc-spill-wmma-scale.ll
+++ b/llvm/test/CodeGen/AMDGPU/regalloc-spill-wmma-scale.ll
@@ -1,9 +1,8 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck %s
 
-; FIXME: Scale operands of WMMA are limited to low 256 VGPRs
-;        currently we are spilling it because all low VGPRs are occupied even though our budget is higher.
+; Scale operands of WMMA are limited to low 256 VGPRs
 ; Make sure we do not spill scale operands because of the low 256 restriction.
-; CHECK: ; ScratchSize: 12
+; CHECK: ; ScratchSize: 0
 ; CHECK: ; Occupancy: 1
 
 define amdgpu_kernel void @spill_scale_test(float %arg, float %arg1, float %arg2, float %arg3, float %arg4, float %arg5, float %arg6, float %arg7, <16 x i32> %arg8, float %arg9, <16 x i32> %arg10, float %arg11, <16 x i8> %arg12) #0 {