[llvm] [AMDGPU] Update target helpers & GCNSchedStrategy for dynamic VGPRs (PR #130047)

Diana Picus via llvm-commits llvm-commits at lists.llvm.org
Wed Mar 19 01:06:39 PDT 2025


https://github.com/rovka updated https://github.com/llvm/llvm-project/pull/130047

>From 3cacd079d8671473213fd273a8df6839723728a2 Mon Sep 17 00:00:00 2001
From: Jannik Silvanus <jannik.silvanus at amd.com>
Date: Thu, 30 Mar 2023 17:45:38 +0200
Subject: [PATCH 1/4] [AMDGPU] Add GFX12 S_ALLOC_VGPR instruction

This patch only adds the instruction for disassembly support.

We have neither an intrinsic nor codegen support, and it is
unclear whether we will ever want to have an intrinsic,
given the fragile semantics.

For now, it will be generated only by the backend in very specific
circumstances.
---
 llvm/lib/Target/AMDGPU/SOPInstructions.td            | 7 +++++++
 llvm/test/MC/AMDGPU/gfx11_unsupported.s              | 3 +++
 llvm/test/MC/AMDGPU/gfx12_asm_sop1.s                 | 9 +++++++++
 llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sop1.txt | 9 +++++++++
 4 files changed, 28 insertions(+)

diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 5e62ceac281b8..eeac9c1ad1084 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -423,6 +423,12 @@ let SubtargetPredicate = isGFX11Plus in {
   }
 } // End SubtargetPredicate = isGFX11Plus
 
+let SubtargetPredicate = isGFX12Plus in {
+  let hasSideEffects = 1, Defs = [SCC] in {
+    def S_ALLOC_VGPR : SOP1_0_32 <"s_alloc_vgpr">;
+  }
+} // End SubtargetPredicate = isGFX12Plus
+
 class SOP1_F32_Inst<string opName, SDPatternOperator Op, ValueType vt0=f32,
                     ValueType vt1=vt0> :
   SOP1_32<opName, [(set vt0:$sdst, (UniformUnaryFrag<Op> vt1:$src0))]>;
@@ -2048,6 +2054,7 @@ defm S_GET_BARRIER_STATE_M0       : SOP1_M0_Real_gfx12<0x050>;
 defm S_BARRIER_SIGNAL_IMM         : SOP1_IMM_Real_gfx12<0x04e>;
 defm S_BARRIER_SIGNAL_ISFIRST_IMM : SOP1_IMM_Real_gfx12<0x04f>;
 defm S_GET_BARRIER_STATE_IMM      : SOP1_IMM_Real_gfx12<0x050>;
+defm S_ALLOC_VGPR                 : SOP1_Real_gfx12<0x053>;
 defm S_SLEEP_VAR                  : SOP1_IMM_Real_gfx12<0x058>;
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/test/MC/AMDGPU/gfx11_unsupported.s b/llvm/test/MC/AMDGPU/gfx11_unsupported.s
index d5d10ce548c4d..cd30647fe6378 100644
--- a/llvm/test/MC/AMDGPU/gfx11_unsupported.s
+++ b/llvm/test/MC/AMDGPU/gfx11_unsupported.s
@@ -322,6 +322,9 @@ image_sample_cd_o v252, v[1:4], s[8:15], s[12:15] dmask:0x1
 image_sample_cd_o_g16 v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x3
 // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
 
+s_alloc_vgpr s0
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
 s_atomic_add flat_scratch_hi, s[2:3], s0
 // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
 
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_sop1.s b/llvm/test/MC/AMDGPU/gfx12_asm_sop1.s
index 1e12e5bb48828..3828a4d36adcc 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_sop1.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_sop1.s
@@ -1,5 +1,14 @@
 // RUN: llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1200 %s | FileCheck --check-prefix=GFX12 %s
 
+s_alloc_vgpr 0x1235
+// GFX12: encoding: [0xff,0x53,0x80,0xbe,0x35,0x12,0x00,0x00]
+
+s_alloc_vgpr 18
+// GFX12: encoding: [0x92,0x53,0x80,0xbe]
+
+s_alloc_vgpr s35
+// GFX12: encoding: [0x23,0x53,0x80,0xbe]
+
 s_sleep_var 0x1234
 // GFX12: encoding: [0xff,0x58,0x80,0xbe,0x34,0x12,0x00,0x00]
 
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sop1.txt
index fa7d020bdd726..2ee84fa976d79 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sop1.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sop1.txt
@@ -1,5 +1,14 @@
 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12 %s
 
+# GFX12: s_alloc_vgpr 0x1235                          ; encoding: [0xff,0x53,0x80,0xbe,0x35,0x12,0x00,0x00]
+0xff,0x53,0x80,0xbe,0x35,0x12,0x00,0x00
+
+# GFX12: s_alloc_vgpr 18                              ; encoding: [0x92,0x53,0x80,0xbe]
+0x92,0x53,0x80,0xbe
+
+# GFX12: s_alloc_vgpr s35                             ; encoding: [0x23,0x53,0x80,0xbe]
+0x23,0x53,0x80,0xbe
+
 # GFX12: s_sleep_var 0x1234                   ; encoding: [0xff,0x58,0x80,0xbe,0x34,0x12,0x00,0x00]
 0xff,0x58,0x80,0xbe,0x34,0x12,0x00,0x00
 

>From b2a7bdc3954d2bf72e99d730ce00159c2550f563 Mon Sep 17 00:00:00 2001
From: Diana Picus <Diana-Magda.Picus at amd.com>
Date: Mon, 23 Oct 2023 10:36:31 +0200
Subject: [PATCH 2/4] [AMDGPU] Add SubtargetFeature for dynamic VGPR mode

This represents a hardware mode supported only for wave32 compute
shaders. When enabled, we set the `.dynamic_vgpr_en` field of
`.compute_registers` to true in the PAL metadata.
---
 llvm/docs/AMDGPUUsage.rst                    |  6 ++++++
 llvm/lib/Target/AMDGPU/AMDGPU.td             |  6 ++++++
 llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp  |  3 +++
 llvm/lib/Target/AMDGPU/GCNSubtarget.h        |  3 +++
 llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll | 13 ++++++++-----
 5 files changed, 26 insertions(+), 5 deletions(-)

diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index def6addd595e8..59cc08a59ed7c 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -758,6 +758,12 @@ For example:
                                                   enabled will execute correctly but may be less
                                                   performant than code generated for XNACK replay
                                                   disabled.
+
+     dynamic-vgpr    TODO                         Represents the "Dynamic VGPR" hardware mode, introduced in GFX12.
+                                                  Waves launched in this mode may allocate or deallocate the VGPRs
+                                                  using dedicated instructions, but may not send the DEALLOC_VGPRS
+                                                  message.
+
      =============== ============================ ==================================================
 
 .. _amdgpu-target-id:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index effc8d2ed6b49..31a98ee132bf6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1239,6 +1239,12 @@ def FeatureXF32Insts : SubtargetFeature<"xf32-insts",
    "v_mfma_f32_16x16x8_xf32 and v_mfma_f32_32x32x4_xf32"
  >;
 
+def FeatureDynamicVGPR : SubtargetFeature <"dynamic-vgpr",
+  "DynamicVGPR",
+  "true",
+  "Enable dynamic VGPR mode"
+>;
+
 // Dummy feature used to disable assembler instructions.
 def FeatureDisable : SubtargetFeature<"",
   "FeatureDisable","true",
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 31e0bd8d652bc..13e61756e3036 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -1414,6 +1414,9 @@ static void EmitPALMetadataCommon(AMDGPUPALMetadata *MD,
     MD->setHwStage(CC, ".trap_present",
                    (bool)CurrentProgramInfo.TrapHandlerEnable);
     MD->setHwStage(CC, ".excp_en", CurrentProgramInfo.EXCPEnable);
+
+    if (ST.isDynamicVGPREnabled())
+      MD->setComputeRegisters(".dynamic_vgpr_en", true);
   }
 
   MD->setHwStage(CC, ".lds_size",
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 6664a70572ded..1254cbad83b60 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -190,6 +190,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   /// indicates a lack of S_CLAUSE support.
   unsigned MaxHardClauseLength = 0;
   bool SupportsSRAMECC = false;
+  bool DynamicVGPR = false;
 
   // This should not be used directly. 'TargetID' tracks the dynamic settings
   // for SRAMECC.
@@ -1647,6 +1648,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
     return true;
   }
 
+  bool isDynamicVGPREnabled() const { return DynamicVGPR; }
+
   bool requiresDisjointEarlyClobberAndUndef() const override {
     // AMDGPU doesn't care if early-clobber and undef operands are allocated
     // to the same register.
diff --git a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll
index 7536e83a9da6b..fa22089978c2e 100644
--- a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll
+++ b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll
@@ -1,4 +1,6 @@
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 <%s | FileCheck %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 <%s | FileCheck %s --check-prefixes=CHECK,GFX11
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 <%s | FileCheck %s --check-prefixes=CHECK
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 -mattr=+dynamic-vgpr <%s | FileCheck %s --check-prefixes=CHECK,DVGPR
 
 ; CHECK-LABEL: {{^}}_amdgpu_cs_main:
 ; CHECK: ; TotalNumSgprs: 4
@@ -8,6 +10,7 @@
 ; CHECK-NEXT: amdpal.pipelines:
 ; CHECK-NEXT:   - .api:            Vulkan
 ; CHECK-NEXT:     .compute_registers:
+; DVGPR-NEXT:       .dynamic_vgpr_en:   true
 ; CHECK-NEXT:       .tg_size_en:     true
 ; CHECK-NEXT:       .tgid_x_en:      false
 ; CHECK-NEXT:       .tgid_y_en:      false
@@ -57,7 +60,7 @@
 ; CHECK-NEXT:        .entry_point_symbol:    _amdgpu_cs_main
 ; CHECK-NEXT:        .excp_en:        0
 ; CHECK-NEXT:        .float_mode:     0xc0
-; CHECK-NEXT:        .ieee_mode:      false
+; GFX11-NEXT:        .ieee_mode:      false
 ; CHECK-NEXT:        .image_op:       false
 ; CHECK-NEXT:        .lds_size:       0
 ; CHECK-NEXT:        .mem_ordered:    true
@@ -112,7 +115,7 @@
 ; CHECK-NEXT:        .debug_mode:     false
 ; CHECK-NEXT:        .entry_point:    _amdgpu_gs
 ; CHECK-NEXT:        .entry_point_symbol:    gs_shader
-; CHECK-NEXT:        .ieee_mode:      false
+; GFX11-NEXT:        .ieee_mode:      false
 ; CHECK-NEXT:        .lds_size:       0x200
 ; CHECK-NEXT:        .mem_ordered:    true
 ; CHECK-NEXT:        .scratch_en:     false
@@ -124,7 +127,7 @@
 ; CHECK-NEXT:        .debug_mode:     false
 ; CHECK-NEXT:        .entry_point:    _amdgpu_hs
 ; CHECK-NEXT:        .entry_point_symbol:    hs_shader
-; CHECK-NEXT:        .ieee_mode:      false
+; GFX11-NEXT:        .ieee_mode:      false
 ; CHECK-NEXT:        .lds_size:       0x1000
 ; CHECK-NEXT:        .mem_ordered:    true
 ; CHECK-NEXT:        .scratch_en:     false
@@ -136,7 +139,7 @@
 ; CHECK-NEXT:        .debug_mode:     false
 ; CHECK-NEXT:        .entry_point:    _amdgpu_ps
 ; CHECK-NEXT:        .entry_point_symbol:    ps_shader
-; CHECK-NEXT:        .ieee_mode:      false
+; GFX11-NEXT:        .ieee_mode:      false
 ; CHECK-NEXT:        .lds_size:       0
 ; CHECK-NEXT:        .mem_ordered:    true
 ; CHECK-NEXT:        .scratch_en:     false

>From c29d8202c06488a9466aea49dda4cf2b4663236e Mon Sep 17 00:00:00 2001
From: Diana Picus <Diana-Magda.Picus at amd.com>
Date: Mon, 23 Oct 2023 11:46:19 +0200
Subject: [PATCH 3/4] [AMDGPU] Deallocate VGPRs before exiting in dynamic VGPR
 mode

In dynamic VGPR mode, waves must deallocate all VGPRs before exiting. If
the shader program does not do this, hardware inserts `S_ALLOC_VGPR 0`
before S_ENDPGM, but this may incur some performance cost. Therefore
it's better if the compiler proactively generates that instruction.

This patch extends `si-insert-waitcnts` to deallocate the VGPRs via
a `S_ALLOC_VGPR 0` before any `S_ENDPGM` when in dynamic VGPR mode.
---
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp   |  60 +--
 .../CodeGen/AMDGPU/release-vgprs-gfx12.mir    | 356 ++++++++++++++++++
 2 files changed, 393 insertions(+), 23 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/release-vgprs-gfx12.mir

diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 7e6bce2bf5f12..42ef23e836a58 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1647,17 +1647,21 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
       (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
     Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
   }
-  // Identify S_ENDPGM instructions which may have to wait for outstanding VMEM
-  // stores. In this case it can be useful to send a message to explicitly
-  // release all VGPRs before the stores have completed, but it is only safe to
-  // do this if:
-  // * there are no outstanding scratch stores
-  // * we are not in Dynamic VGPR mode
+  // In dynamic VGPR mode, we want to release the VGPRs before the wave exits.
+  // Technically the hardware will do this on its own if we don't, but that
+  // might cost extra cycles compared to doing it explicitly.
+  // When not in dynamic VGPR mode, identify S_ENDPGM instructions which may
+  // have to wait for outstanding VMEM stores. In this case it can be useful to
+  // send a message to explicitly release all VGPRs before the stores have
+  // completed, but it is only safe to do this if there are no outstanding
+  // scratch stores.
   else if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
            MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) {
-    if (ST->getGeneration() >= AMDGPUSubtarget::GFX11 && !WCG->isOptNone() &&
-        ScoreBrackets.getScoreRange(STORE_CNT) != 0 &&
-        !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS))
+    if (!WCG->isOptNone() &&
+        (ST->isDynamicVGPREnabled() ||
+         (ST->getGeneration() >= AMDGPUSubtarget::GFX11 &&
+          ScoreBrackets.getScoreRange(STORE_CNT) != 0 &&
+          !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS))))
       ReleaseVGPRInsts.insert(&MI);
   }
   // Resolve vm waits before gs-done.
@@ -2610,26 +2614,36 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
     }
   }
 
-  // Insert DEALLOC_VGPR messages before previously identified S_ENDPGM
-  // instructions.
+  // Deallocate the VGPRs before previously identified S_ENDPGM instructions.
+  // This is done in different ways depending on how the VGPRs were allocated
+  // (i.e. whether we're in dynamic VGPR mode or not).
   // Skip deallocation if kernel is waveslot limited vs VGPR limited. A short
   // waveslot limited kernel runs slower with the deallocation.
-  if (!ReleaseVGPRInsts.empty() &&
-      (MF.getFrameInfo().hasCalls() ||
-       ST->getOccupancyWithNumVGPRs(
-           TRI->getNumUsedPhysRegs(*MRI, AMDGPU::VGPR_32RegClass)) <
-           AMDGPU::IsaInfo::getMaxWavesPerEU(ST))) {
+  if (ST->isDynamicVGPREnabled()) {
     for (MachineInstr *MI : ReleaseVGPRInsts) {
-      if (ST->requiresNopBeforeDeallocVGPRs()) {
-        BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
-                TII->get(AMDGPU::S_NOP))
-            .addImm(0);
-      }
       BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
-              TII->get(AMDGPU::S_SENDMSG))
-          .addImm(AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus);
+              TII->get(AMDGPU::S_ALLOC_VGPR))
+          .addImm(0);
       Modified = true;
     }
+  } else {
+    if (!ReleaseVGPRInsts.empty() &&
+        (MF.getFrameInfo().hasCalls() ||
+         ST->getOccupancyWithNumVGPRs(
+             TRI->getNumUsedPhysRegs(*MRI, AMDGPU::VGPR_32RegClass)) <
+             AMDGPU::IsaInfo::getMaxWavesPerEU(ST))) {
+      for (MachineInstr *MI : ReleaseVGPRInsts) {
+        if (ST->requiresNopBeforeDeallocVGPRs()) {
+          BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+                  TII->get(AMDGPU::S_NOP))
+              .addImm(0);
+        }
+        BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+                TII->get(AMDGPU::S_SENDMSG))
+            .addImm(AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus);
+        Modified = true;
+      }
+    }
   }
   ReleaseVGPRInsts.clear();
   PreheadersToFlush.clear();
diff --git a/llvm/test/CodeGen/AMDGPU/release-vgprs-gfx12.mir b/llvm/test/CodeGen/AMDGPU/release-vgprs-gfx12.mir
new file mode 100644
index 0000000000000..884b5f8b6f018
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/release-vgprs-gfx12.mir
@@ -0,0 +1,356 @@
+# RUN: llc -O2 -march=amdgcn -mcpu=gfx1200 -run-pass=si-insert-waitcnts -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=CHECK,DEFAULT
+# RUN: llc -O2 -march=amdgcn -mcpu=gfx1200 -mattr=+dynamic-vgpr -run-pass=si-insert-waitcnts -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=CHECK,DVGPR
+
+--- |
+  define amdgpu_ps void @tbuffer_store1() { ret void }
+  define amdgpu_ps void @tbuffer_store2() { ret void }
+  define amdgpu_ps void @flat_store() { ret void }
+  define amdgpu_ps void @global_store() { ret void }
+  define amdgpu_ps void @buffer_store_format() { ret void }
+  define amdgpu_ps void @ds_write_b32() { ret void }
+  define amdgpu_ps void @global_store_dword() { ret void }
+  define amdgpu_ps void @multiple_basic_blocks1() { ret void }
+  define amdgpu_ps void @multiple_basic_blocks2() { ret void }
+  define amdgpu_ps void @multiple_basic_blocks3() { ret void }
+  define amdgpu_ps void @recursive_loop() { ret void }
+  define amdgpu_ps void @recursive_loop_vmem() { ret void }
+  define amdgpu_ps void @image_store() { ret void }
+  define amdgpu_ps void @scratch_store() { ret void }
+  define amdgpu_ps void @buffer_atomic() { ret void }
+  define amdgpu_ps void @flat_atomic() { ret void }
+  define amdgpu_ps void @global_atomic() { ret void }
+  define amdgpu_ps void @image_atomic() { ret void }
+  define amdgpu_ps void @global_store_optnone() noinline optnone { ret void }
+...
+
+---
+name:            tbuffer_store1
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: tbuffer_store1
+    ; CHECK-NOT: S_SENDMSG 3
+    ; DEFAULT-NOT: S_ALLOC_VGPR
+    ; DVGPR: S_ALLOC_VGPR 0
+    ; CHECK: S_ENDPGM 0
+    TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 42, 117, 0, 0, implicit $exec
+    S_ENDPGM 0
+...
+
+---
+name:            tbuffer_store2
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: tbuffer_store2
+    ; CHECK-NOT: S_SENDMSG 3
+    ; DEFAULT-NOT: S_ALLOC_VGPR
+    ; DVGPR: S_ALLOC_VGPR 0
+    ; CHECK: S_ENDPGM 0
+    TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 115, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7)
+    S_ENDPGM 0
+...
+
+---
+name:            flat_store
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: flat_store
+    ; CHECK-NOT: S_SENDMSG 3
+    ; DEFAULT-NOT: S_ALLOC_VGPR
+    ; DVGPR: S_ALLOC_VGPR 0
+    ; CHECK: S_ENDPGM 0
+    FLAT_STORE_DWORDX4 $vgpr49_vgpr50, $vgpr26_vgpr27_vgpr28_vgpr29, 0, 0, implicit $exec, implicit $flat_scr
+    S_ENDPGM 0
+...
+
+---
+name:            global_store
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: global_store
+    ; CHECK-NOT: S_SENDMSG 3
+    ; DEFAULT-NOT: S_ALLOC_VGPR
+    ; DVGPR: S_ALLOC_VGPR 0
+    ; CHECK: S_ENDPGM 0
+    GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec
+    S_WAIT_STORECNT 0
+    S_ENDPGM 0
+...
+
+---
+name:            buffer_store_format
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: buffer_store_format
+    ; CHECK-NOT: S_SENDMSG 3
+    ; DEFAULT-NOT: S_ALLOC_VGPR
+    ; DVGPR: S_ALLOC_VGPR 0
+    ; CHECK: S_ENDPGM 0
+    BUFFER_STORE_FORMAT_D16_X_OFFEN_exact killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 0, 0, 0, implicit $exec
+    S_ENDPGM 0
+...
+
+---
+name:            ds_write_b32
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: ds_write_b32
+    ; CHECK-NOT: S_SENDMSG 3
+    ; DEFAULT-NOT: S_ALLOC_VGPR
+    ; DVGPR: S_ALLOC_VGPR 0
+    ; CHECK: S_ENDPGM 0
+    renamable $vgpr0 = IMPLICIT_DEF
+    renamable $vgpr1 = IMPLICIT_DEF
+    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 12, 0, implicit $exec, implicit $m0
+    S_ENDPGM 0
+
+...
+---
+name:            global_store_dword
+body:             |
+  bb.0:
+    liveins: $vgpr0, $sgpr0_sgpr1
+
+    ; CHECK-LABEL: name: global_store_dword
+    ; CHECK-NOT: S_SENDMSG 3
+    ; DEFAULT-NOT: S_ALLOC_VGPR
+    ; DVGPR: S_ALLOC_VGPR 0
+    ; CHECK: S_ENDPGM 0
+    renamable $vgpr0 = V_MAD_I32_I24_e64 killed $vgpr1, killed $vgpr0, killed $sgpr2, 0, implicit $exec
+    GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr2, killed renamable $vgpr0, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec
+    S_ENDPGM 0
+...
+
+---
+name:            multiple_basic_blocks1
+body:             |
+  ; CHECK-LABEL: name: multiple_basic_blocks1
+  ; CHECK-NOT: S_SENDMSG 3
+  ; DEFAULT-NOT: S_ALLOC_VGPR
+  ; DVGPR: S_ALLOC_VGPR 0
+  ; CHECK:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.1
+
+    renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.1, %bb.2
+
+    $vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
+    S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
+    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+    S_BRANCH %bb.2
+
+  bb.2:
+    S_ENDPGM 0
+
+...
+
+---
+name:            multiple_basic_blocks2
+body:             |
+  ; CHECK-LABEL: name: multiple_basic_blocks2
+  ; CHECK: bb.2:
+  ; CHECK-NOT: S_SENDMSG 3
+  ; DEFAULT-NOT: S_ALLOC_VGPR
+  ; DVGPR: S_ALLOC_VGPR 0
+  ; CHECK: S_ENDPGM 0
+  bb.0:
+    successors: %bb.2
+
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 125, 0, 0, implicit $exec
+    $vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
+    S_BRANCH %bb.2
+
+  bb.1:
+    successors: %bb.2
+
+    $vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 125, 0, 0, implicit $exec
+    S_BRANCH %bb.2
+
+  bb.2:
+    S_ENDPGM 0
+...
+
+---
+name:            multiple_basic_blocks3
+body:             |
+  ; CHECK-LABEL: name: multiple_basic_blocks3
+  ; CHECK: bb.4:
+  ; CHECK-NOT: S_SENDMSG 3
+  ; DEFAULT-NOT: S_ALLOC_VGPR
+  ; DVGPR: S_ALLOC_VGPR 0
+  ; CHECK: S_ENDPGM 0
+  bb.0:
+    successors: %bb.2
+
+    $vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 125, 0, 0, implicit $exec
+    S_BRANCH %bb.2
+
+  bb.1:
+    successors: %bb.2
+
+    $vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
+    S_BRANCH %bb.2
+
+  bb.2:
+    successors: %bb.4
+
+    S_BRANCH %bb.4
+
+  bb.3:
+    successors: %bb.4
+
+    $vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.4:
+    S_ENDPGM 0
+...
+
+---
+name:            recursive_loop
+body:             |
+  ; CHECK-LABEL: name: recursive_loop
+  ; CHECK-NOT: S_SENDMSG 3
+  ; DEFAULT-NOT: S_ALLOC_VGPR
+  ; DVGPR: S_ALLOC_VGPR 0
+  ; CHECK:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.1
+
+    renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.1, %bb.2
+
+    S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
+    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+    S_BRANCH %bb.2
+
+  bb.2:
+    S_ENDPGM 0
+...
+
+---
+name:            recursive_loop_vmem
+body:             |
+  ; CHECK-LABEL: name: recursive_loop_vmem
+  ; CHECK-NOT: S_SENDMSG 3
+  ; DEFAULT-NOT: S_ALLOC_VGPR
+  ; DVGPR: S_ALLOC_VGPR 0
+  ; CHECK: S_ENDPGM 0
+  bb.0:
+    successors: %bb.1
+
+    renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.1, %bb.2
+
+    TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 115, 0, 0, implicit $exec
+    S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
+    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+    S_BRANCH %bb.2
+
+  bb.2:
+    S_ENDPGM 0
+...
+
+---
+name:            image_store
+body:             |
+  bb.0:
+  ; CHECK-LABEL: name: image_store
+  ; CHECK-NOT: S_SENDMSG 3
+  ; DEFAULT-NOT: S_ALLOC_VGPR
+  ; DVGPR: S_ALLOC_VGPR 0
+  ; CHECK: S_ENDPGM 0
+  IMAGE_STORE_V2_V1_gfx11 killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 12, 0, 1, 0, 0, -1, 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s32>), addrspace 7)
+  S_ENDPGM 0
+...
+
+---
+name:            scratch_store
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: scratch_store
+    ; CHECK-NOT: S_SENDMSG 3
+    ; DEFAULT-NOT: S_ALLOC_VGPR
+    ; DVGPR: S_ALLOC_VGPR 0
+    ; CHECK: S_ENDPGM 0
+    renamable $sgpr0 = S_AND_B32 killed renamable $sgpr0, -16, implicit-def dead $scc
+    SCRATCH_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $sgpr0, 0, 0, implicit $exec, implicit $flat_scr
+    S_ENDPGM 0
+...
+
+---
+name:            buffer_atomic
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: buffer_atomic
+    ; CHECK-NOT: S_SENDMSG 3
+    ; DEFAULT-NOT: S_ALLOC_VGPR
+    ; DVGPR: S_ALLOC_VGPR 0
+    ; CHECK: S_ENDPGM 0
+    BUFFER_ATOMIC_ADD_F32_OFFEN killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 7)
+    S_ENDPGM 0
+...
+
+---
+name:            flat_atomic
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: flat_atomic
+    ; CHECK-NOT: S_SENDMSG 3
+    ; DEFAULT-NOT: S_ALLOC_VGPR
+    ; DVGPR: S_ALLOC_VGPR 0
+    ; CHECK: S_ENDPGM 0
+    renamable $vgpr0_vgpr1 = FLAT_ATOMIC_DEC_X2_RTN killed renamable $vgpr0_vgpr1, killed renamable $vgpr2_vgpr3, 40, 1, implicit $exec, implicit $flat_scr
+    S_ENDPGM 0
+...
+
+
+---
+name:            global_atomic
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: global_atomic
+    ; CHECK-NOT: S_SENDMSG 3
+    ; DEFAULT-NOT: S_ALLOC_VGPR
+    ; DVGPR: S_ALLOC_VGPR 0
+    ; CHECK: S_ENDPGM 0
+    renamable $vgpr0_vgpr1 = GLOBAL_ATOMIC_INC_X2_SADDR_RTN killed renamable $vgpr0, killed renamable $vgpr1_vgpr2, killed renamable $sgpr0_sgpr1, 40, 1, implicit $exec
+    S_ENDPGM 0
+...
+
+---
+name:            image_atomic
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: image_atomic
+    ; CHECK-NOT: S_SENDMSG 3
+    ; DEFAULT-NOT: S_ALLOC_VGPR
+    ; DVGPR: S_ALLOC_VGPR 0
+    ; CHECK: S_ENDPGM 0
+    renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_ATOMIC_CMPSWAP_V4_V1_gfx12 killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, 0, 1, 1, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 7)
+    S_ENDPGM 0
+...
+
+---
+name:            global_store_optnone
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: global_store_optnone
+    ; CHECK-NOT: S_SENDMSG 3
+    ; CHECK-NOT: S_ALLOC_VGPR
+    ; CHECK: S_ENDPGM 0
+    GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec
+    S_WAIT_STORECNT 0
+    S_ENDPGM 0
+...

>From d3ceb4ebd008183980b97d0887f9b2bdb30b7c6f Mon Sep 17 00:00:00 2001
From: Diana Picus <Diana-Magda.Picus at amd.com>
Date: Mon, 5 Feb 2024 13:48:16 +0100
Subject: [PATCH 4/4] [AMDGPU] Update target helpers & GCNSchedStrategy for
 dynamic VGPRs

In dynamic VGPR mode, we can allocate up to 8 blocks of either 16 or 32
VGPRs (based on a chip-wide setting which we can model with a Subtarget
feature). Update some of the subtarget helpers to reflect this.

In particular:
- getVGPRAllocGranule is set to the block size
- getAddressableNumVGPRs will limit itself to 8 * size of a block

We also try to be more careful about how many VGPR blocks we allocate.
Therefore, when deciding if we should revert scheduling after a given
stage, we check that we haven't increased the number of VGPR blocks that
need to be allocated.
---
 llvm/lib/Target/AMDGPU/AMDGPU.td              |  6 ++
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp   | 10 +++
 llvm/lib/Target/AMDGPU/GCNSubtarget.h         |  1 +
 .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp    |  6 ++
 .../Target/AMDGPU/AMDGPUUnitTests.cpp         | 62 +++++++++++++++++++
 llvm/unittests/Target/AMDGPU/CMakeLists.txt   |  1 +
 6 files changed, 86 insertions(+)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 31a98ee132bf6..339eeec72da46 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1245,6 +1245,12 @@ def FeatureDynamicVGPR : SubtargetFeature <"dynamic-vgpr",
   "Enable dynamic VGPR mode"
 >;
 
+def FeatureDynamicVGPRBlockSize32 : SubtargetFeature<"dynamic-vgpr-block-size-32",
+  "DynamicVGPRBlockSize32",
+  "true",
+  "Use a block size of 32 for dynamic VGPR allocation (default is 16)"
+>;
+
 // Dummy feature used to disable assembler instructions.
 def FeatureDisable : SubtargetFeature<"",
   "FeatureDisable","true",
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index c277223de13ac..4cc71f321f8f2 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -1452,6 +1452,16 @@ bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) {
   if (WavesAfter < DAG.MinOccupancy)
     return true;
 
+  // For dynamic VGPR mode, we don't want to waste any VGPR blocks.
+  if (ST.isDynamicVGPREnabled()) {
+    unsigned BlocksBefore = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks(
+        &ST, PressureBefore.getVGPRNum(false));
+    unsigned BlocksAfter = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks(
+        &ST, PressureAfter.getVGPRNum(false));
+    if (BlocksAfter > BlocksBefore)
+      return true;
+  }
+
   return false;
 }
 
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 1254cbad83b60..9ccf38fb4dbbe 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -191,6 +191,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   unsigned MaxHardClauseLength = 0;
   bool SupportsSRAMECC = false;
   bool DynamicVGPR = false;
+  bool DynamicVGPRBlockSize32 = false;
 
   // This should not be used directly. 'TargetID' tracks the dynamic settings
   // for SRAMECC.
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index b51cf536467b9..bebbb0dde0b9b 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -1154,6 +1154,9 @@ unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI,
   if (STI->getFeatureBits().test(FeatureGFX90AInsts))
     return 8;
 
+  if (STI->getFeatureBits().test(FeatureDynamicVGPR))
+    return STI->getFeatureBits().test(FeatureDynamicVGPRBlockSize32) ? 32 : 16;
+
   bool IsWave32 = EnableWavefrontSize32 ?
       *EnableWavefrontSize32 :
       STI->getFeatureBits().test(FeatureWavefrontSize32);
@@ -1195,6 +1198,9 @@ unsigned getAddressableNumArchVGPRs(const MCSubtargetInfo *STI) { return 256; }
 unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI) {
   if (STI->getFeatureBits().test(FeatureGFX90AInsts))
     return 512;
+  if (STI->getFeatureBits().test(FeatureDynamicVGPR))
+    // GFX12 can allocate at most 8 VGPR blocks, so the limit is 8x block size.
+    return 8 * getVGPRAllocGranule(STI);
   return getAddressableNumArchVGPRs(STI);
 }
 
diff --git a/llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.cpp b/llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.cpp
index 8fbd470815b79..21f45443281e7 100644
--- a/llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.cpp
+++ b/llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.cpp
@@ -152,6 +152,24 @@ static void testGPRLimits(const char *RegName, bool TestW32W64,
   EXPECT_TRUE(ErrStr.empty()) << ErrStr;
 }
 
+static void testDynamicVGPRLimits(StringRef CPUName, StringRef FS,
+                                  TestFuncTy test) {
+  auto TM = createAMDGPUTargetMachine("amdgcn-amd-", CPUName,
+                                      "+dynamic-vgpr," + FS.str());
+  ASSERT_TRUE(TM) << "No target machine";
+
+  GCNSubtarget ST(TM->getTargetTriple(), std::string(TM->getTargetCPU()),
+                  std::string(TM->getTargetFeatureString()), *TM);
+  ASSERT_TRUE(ST.getFeatureBits().test(AMDGPU::FeatureDynamicVGPR));
+
+  std::stringstream Table;
+  bool Success = testAndRecord(Table, ST, test);
+  EXPECT_TRUE(Success && !PrintCpuRegLimits)
+      << CPUName << " dynamic VGPR " << FS
+      << ":\nOcc    MinVGPR        MaxVGPR\n"
+      << Table.str() << '\n';
+}
+
 TEST(AMDGPU, TestVGPRLimitsPerOccupancy) {
   auto test = [](std::stringstream &OS, unsigned Occ, const GCNSubtarget &ST) {
     unsigned MaxVGPRNum = ST.getAddressableNumVGPRs();
@@ -163,6 +181,50 @@ TEST(AMDGPU, TestVGPRLimitsPerOccupancy) {
   };
 
   testGPRLimits("VGPR", true, test);
+
+  testDynamicVGPRLimits("gfx1200", "+wavefrontsize32", test);
+  testDynamicVGPRLimits("gfx1200",
+                        "+wavefrontsize32,+dynamic-vgpr-block-size-32", test);
+}
+
+static void testAbsoluteLimits(StringRef CPUName, StringRef FS,
+                               unsigned ExpectedMinOcc, unsigned ExpectedMaxOcc,
+                               unsigned ExpectedMaxVGPRs) {
+  auto TM = createAMDGPUTargetMachine("amdgcn-amd-", CPUName, FS);
+  ASSERT_TRUE(TM) << "No target machine";
+
+  GCNSubtarget ST(TM->getTargetTriple(), std::string(TM->getTargetCPU()),
+                  std::string(TM->getTargetFeatureString()), *TM);
+
+  // Test a function without an 'amdgpu-waves-per-eu' attribute.
+  LLVMContext Context;
+  Module M("", Context);
+  Function *Func =
+      Function::Create(FunctionType::get(Type::getVoidTy(Context), false),
+                       GlobalValue::ExternalLinkage, "testFunc", &M);
+  Func->setCallingConv(CallingConv::AMDGPU_CS_Chain);
+  Func->addFnAttr("amdgpu-flat-work-group-size", "1,32");
+
+  auto Range = ST.getWavesPerEU(*Func);
+  EXPECT_EQ(ExpectedMinOcc, Range.first) << CPUName << ' ' << FS;
+  EXPECT_EQ(ExpectedMaxOcc, Range.second) << CPUName << ' ' << FS;
+  EXPECT_EQ(ExpectedMaxVGPRs, ST.getMaxNumVGPRs(*Func)) << CPUName << ' ' << FS;
+  EXPECT_EQ(ExpectedMaxVGPRs, ST.getAddressableNumVGPRs())
+      << CPUName << ' ' << FS;
+
+  // Function with requested 'amdgpu-waves-per-eu' in a valid range.
+  Func->addFnAttr("amdgpu-waves-per-eu", "10,12");
+  Range = ST.getWavesPerEU(*Func);
+  EXPECT_EQ(10u, Range.first) << CPUName << ' ' << FS;
+  EXPECT_EQ(12u, Range.second) << CPUName << ' ' << FS;
+}
+
+TEST(AMDGPU, TestOccupancyAbsoluteLimits) {
+  testAbsoluteLimits("gfx1200", "+wavefrontsize32", 1, 16, 256);
+  testAbsoluteLimits("gfx1200", "+wavefrontsize32,+dynamic-vgpr", 1, 16, 128);
+  testAbsoluteLimits(
+      "gfx1200", "+wavefrontsize32,+dynamic-vgpr,+dynamic-vgpr-block-size-32",
+      1, 16, 256);
 }
 
 static const char *printSubReg(const TargetRegisterInfo &TRI, unsigned SubReg) {
diff --git a/llvm/unittests/Target/AMDGPU/CMakeLists.txt b/llvm/unittests/Target/AMDGPU/CMakeLists.txt
index ca8f48bc393ef..6d6f17883a07e 100644
--- a/llvm/unittests/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/unittests/Target/AMDGPU/CMakeLists.txt
@@ -13,6 +13,7 @@ set(LLVM_LINK_COMPONENTS
   Core
   GlobalISel
   MC
+  MIRParser
   Support
   TargetParser
   )



More information about the llvm-commits mailing list