[llvm] [AMDGPU] Deallocate VGPRs before exiting in dynamic VGPR mode (PR #130037)

Diana Picus via llvm-commits llvm-commits at lists.llvm.org
Tue Mar 18 04:28:03 PDT 2025


https://github.com/rovka updated https://github.com/llvm/llvm-project/pull/130037

From 58c9daabc6dbe8bbb21050ee0ead663fbb54fd50 Mon Sep 17 00:00:00 2001
From: Diana Picus <Diana-Magda.Picus at amd.com>
Date: Mon, 23 Oct 2023 11:46:19 +0200
Subject: [PATCH 1/2] [AMDGPU] Deallocate VGPRs before exiting in dynamic VGPR
 mode

In dynamic VGPR mode, waves must deallocate all VGPRs before exiting. If
the shader program does not do this, the hardware inserts an `S_ALLOC_VGPR 0`
before the `S_ENDPGM`, but this may incur some performance cost. It is
therefore better for the compiler to generate that instruction proactively.

This patch extends `si-insert-waitcnts` to deallocate the VGPRs via
an `S_ALLOC_VGPR 0` before every `S_ENDPGM` when in dynamic VGPR mode.
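
For illustration, a minimal hand-written sketch of the intended effect (not
actual compiler output), reusing the store from the `global_store` test added
below; only the `S_ALLOC_VGPR 0` line is new:

  ; input MIR
  GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec
  S_WAIT_STORECNT 0
  S_ENDPGM 0

  ; after si-insert-waitcnts with -mattr=+dynamic-vgpr: the VGPRs are
  ; released immediately before the wave exits
  GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec
  S_WAIT_STORECNT 0
  S_ALLOC_VGPR 0
  S_ENDPGM 0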
---
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp   |  60 +--
 .../CodeGen/AMDGPU/release-vgprs-gfx12.mir    | 356 ++++++++++++++++++
 2 files changed, 393 insertions(+), 23 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/release-vgprs-gfx12.mir

diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 239f2664f59f3..fab4107a127d0 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1640,17 +1640,21 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
       (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
     Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
   }
-  // Identify S_ENDPGM instructions which may have to wait for outstanding VMEM
-  // stores. In this case it can be useful to send a message to explicitly
-  // release all VGPRs before the stores have completed, but it is only safe to
-  // do this if:
-  // * there are no outstanding scratch stores
-  // * we are not in Dynamic VGPR mode
+  // In dynamic VGPR mode, we want to release the VGPRs before the wave exits.
+  // Technically the hardware will do this on its own if we don't, but that
+  // might cost extra cycles compared to doing it explicitly.
+  // When not in dynamic VGPR mode, identify S_ENDPGM instructions which may
+  // have to wait for outstanding VMEM stores. In this case it can be useful to
+  // send a message to explicitly release all VGPRs before the stores have
+  // completed, but it is only safe to do this if there are no outstanding
+  // scratch stores.
   else if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
            MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) {
-    if (ST->getGeneration() >= AMDGPUSubtarget::GFX11 && !WCG->isOptNone() &&
-        ScoreBrackets.getScoreRange(STORE_CNT) != 0 &&
-        !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS))
+    if (!WCG->isOptNone() &&
+        (ST->isDynamicVGPREnabled() ||
+         (ST->getGeneration() >= AMDGPUSubtarget::GFX11 &&
+          ScoreBrackets.getScoreRange(STORE_CNT) != 0 &&
+          !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS))))
       ReleaseVGPRInsts.insert(&MI);
   }
   // Resolve vm waits before gs-done.
@@ -2593,26 +2597,36 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
     }
   }
 
-  // Insert DEALLOC_VGPR messages before previously identified S_ENDPGM
-  // instructions.
+  // Deallocate the VGPRs before previously identified S_ENDPGM instructions.
+  // This is done in different ways depending on how the VGPRs were allocated
+  // (i.e. whether we're in dynamic VGPR mode or not).
   // Skip deallocation if kernel is waveslot limited vs VGPR limited. A short
   // waveslot limited kernel runs slower with the deallocation.
-  if (!ReleaseVGPRInsts.empty() &&
-      (MF.getFrameInfo().hasCalls() ||
-       ST->getOccupancyWithNumVGPRs(
-           TRI->getNumUsedPhysRegs(*MRI, AMDGPU::VGPR_32RegClass)) <
-           AMDGPU::IsaInfo::getMaxWavesPerEU(ST))) {
+  if (ST->isDynamicVGPREnabled()) {
     for (MachineInstr *MI : ReleaseVGPRInsts) {
-      if (ST->requiresNopBeforeDeallocVGPRs()) {
-        BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
-                TII->get(AMDGPU::S_NOP))
-            .addImm(0);
-      }
       BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
-              TII->get(AMDGPU::S_SENDMSG))
-          .addImm(AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus);
+              TII->get(AMDGPU::S_ALLOC_VGPR))
+          .addImm(0);
       Modified = true;
     }
+  } else {
+    if (!ReleaseVGPRInsts.empty() &&
+        (MF.getFrameInfo().hasCalls() ||
+         ST->getOccupancyWithNumVGPRs(
+             TRI->getNumUsedPhysRegs(*MRI, AMDGPU::VGPR_32RegClass)) <
+             AMDGPU::IsaInfo::getMaxWavesPerEU(ST))) {
+      for (MachineInstr *MI : ReleaseVGPRInsts) {
+        if (ST->requiresNopBeforeDeallocVGPRs()) {
+          BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+                  TII->get(AMDGPU::S_NOP))
+              .addImm(0);
+        }
+        BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+                TII->get(AMDGPU::S_SENDMSG))
+            .addImm(AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus);
+        Modified = true;
+      }
+    }
   }
   ReleaseVGPRInsts.clear();
   PreheadersToFlush.clear();
diff --git a/llvm/test/CodeGen/AMDGPU/release-vgprs-gfx12.mir b/llvm/test/CodeGen/AMDGPU/release-vgprs-gfx12.mir
new file mode 100644
index 0000000000000..884b5f8b6f018
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/release-vgprs-gfx12.mir
@@ -0,0 +1,356 @@
+# RUN: llc -O2 -march=amdgcn -mcpu=gfx1200 -run-pass=si-insert-waitcnts -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=CHECK,DEFAULT
+# RUN: llc -O2 -march=amdgcn -mcpu=gfx1200 -mattr=+dynamic-vgpr -run-pass=si-insert-waitcnts -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=CHECK,DVGPR
+
+--- |
+  define amdgpu_ps void @tbuffer_store1() { ret void }
+  define amdgpu_ps void @tbuffer_store2() { ret void }
+  define amdgpu_ps void @flat_store() { ret void }
+  define amdgpu_ps void @global_store() { ret void }
+  define amdgpu_ps void @buffer_store_format() { ret void }
+  define amdgpu_ps void @ds_write_b32() { ret void }
+  define amdgpu_ps void @global_store_dword() { ret void }
+  define amdgpu_ps void @multiple_basic_blocks1() { ret void }
+  define amdgpu_ps void @multiple_basic_blocks2() { ret void }
+  define amdgpu_ps void @multiple_basic_blocks3() { ret void }
+  define amdgpu_ps void @recursive_loop() { ret void }
+  define amdgpu_ps void @recursive_loop_vmem() { ret void }
+  define amdgpu_ps void @image_store() { ret void }
+  define amdgpu_ps void @scratch_store() { ret void }
+  define amdgpu_ps void @buffer_atomic() { ret void }
+  define amdgpu_ps void @flat_atomic() { ret void }
+  define amdgpu_ps void @global_atomic() { ret void }
+  define amdgpu_ps void @image_atomic() { ret void }
+  define amdgpu_ps void @global_store_optnone() noinline optnone { ret void }
+...
+
+---
+name:            tbuffer_store1
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: tbuffer_store1
+    ; CHECK-NOT: S_SENDMSG 3
+    ; DEFAULT-NOT: S_ALLOC_VGPR
+    ; DVGPR: S_ALLOC_VGPR 0
+    ; CHECK: S_ENDPGM 0
+    TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 42, 117, 0, 0, implicit $exec
+    S_ENDPGM 0
+...
+
+---
+name:            tbuffer_store2
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: tbuffer_store2
+    ; CHECK-NOT: S_SENDMSG 3
+    ; DEFAULT-NOT: S_ALLOC_VGPR
+    ; DVGPR: S_ALLOC_VGPR 0
+    ; CHECK: S_ENDPGM 0
+    TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 115, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7)
+    S_ENDPGM 0
+...
+
+---
+name:            flat_store
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: flat_store
+    ; CHECK-NOT: S_SENDMSG 3
+    ; DEFAULT-NOT: S_ALLOC_VGPR
+    ; DVGPR: S_ALLOC_VGPR 0
+    ; CHECK: S_ENDPGM 0
+    FLAT_STORE_DWORDX4 $vgpr49_vgpr50, $vgpr26_vgpr27_vgpr28_vgpr29, 0, 0, implicit $exec, implicit $flat_scr
+    S_ENDPGM 0
+...
+
+---
+name:            global_store
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: global_store
+    ; CHECK-NOT: S_SENDMSG 3
+    ; DEFAULT-NOT: S_ALLOC_VGPR
+    ; DVGPR: S_ALLOC_VGPR 0
+    ; CHECK: S_ENDPGM 0
+    GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec
+    S_WAIT_STORECNT 0
+    S_ENDPGM 0
+...
+
+---
+name:            buffer_store_format
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: buffer_store_format
+    ; CHECK-NOT: S_SENDMSG 3
+    ; DEFAULT-NOT: S_ALLOC_VGPR
+    ; DVGPR: S_ALLOC_VGPR 0
+    ; CHECK: S_ENDPGM 0
+    BUFFER_STORE_FORMAT_D16_X_OFFEN_exact killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 0, 0, 0, implicit $exec
+    S_ENDPGM 0
+...
+
+---
+name:            ds_write_b32
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: ds_write_b32
+    ; CHECK-NOT: S_SENDMSG 3
+    ; DEFAULT-NOT: S_ALLOC_VGPR
+    ; DVGPR: S_ALLOC_VGPR 0
+    ; CHECK: S_ENDPGM 0
+    renamable $vgpr0 = IMPLICIT_DEF
+    renamable $vgpr1 = IMPLICIT_DEF
+    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 12, 0, implicit $exec, implicit $m0
+    S_ENDPGM 0
+
+...
+---
+name:            global_store_dword
+body:             |
+  bb.0:
+    liveins: $vgpr0, $sgpr0_sgpr1
+
+    ; CHECK-LABEL: name: global_store_dword
+    ; CHECK-NOT: S_SENDMSG 3
+    ; DEFAULT-NOT: S_ALLOC_VGPR
+    ; DVGPR: S_ALLOC_VGPR 0
+    ; CHECK: S_ENDPGM 0
+    renamable $vgpr0 = V_MAD_I32_I24_e64 killed $vgpr1, killed $vgpr0, killed $sgpr2, 0, implicit $exec
+    GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr2, killed renamable $vgpr0, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec
+    S_ENDPGM 0
+...
+
+---
+name:            multiple_basic_blocks1
+body:             |
+  ; CHECK-LABEL: name: multiple_basic_blocks1
+  ; CHECK-NOT: S_SENDMSG 3
+  ; DEFAULT-NOT: S_ALLOC_VGPR
+  ; DVGPR: S_ALLOC_VGPR 0
+  ; CHECK:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.1
+
+    renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.1, %bb.2
+
+    $vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
+    S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
+    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+    S_BRANCH %bb.2
+
+  bb.2:
+    S_ENDPGM 0
+
+...
+
+---
+name:            multiple_basic_blocks2
+body:             |
+  ; CHECK-LABEL: name: multiple_basic_blocks2
+  ; CHECK: bb.2:
+  ; CHECK-NOT: S_SENDMSG 3
+  ; DEFAULT-NOT: S_ALLOC_VGPR
+  ; DVGPR: S_ALLOC_VGPR 0
+  ; CHECK: S_ENDPGM 0
+  bb.0:
+    successors: %bb.2
+
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 125, 0, 0, implicit $exec
+    $vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
+    S_BRANCH %bb.2
+
+  bb.1:
+    successors: %bb.2
+
+    $vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 125, 0, 0, implicit $exec
+    S_BRANCH %bb.2
+
+  bb.2:
+    S_ENDPGM 0
+...
+
+---
+name:            multiple_basic_blocks3
+body:             |
+  ; CHECK-LABEL: name: multiple_basic_blocks3
+  ; CHECK: bb.4:
+  ; CHECK-NOT: S_SENDMSG 3
+  ; DEFAULT-NOT: S_ALLOC_VGPR
+  ; DVGPR: S_ALLOC_VGPR 0
+  ; CHECK: S_ENDPGM 0
+  bb.0:
+    successors: %bb.2
+
+    $vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 125, 0, 0, implicit $exec
+    S_BRANCH %bb.2
+
+  bb.1:
+    successors: %bb.2
+
+    $vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
+    S_BRANCH %bb.2
+
+  bb.2:
+    successors: %bb.4
+
+    S_BRANCH %bb.4
+
+  bb.3:
+    successors: %bb.4
+
+    $vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.4:
+    S_ENDPGM 0
+...
+
+---
+name:            recursive_loop
+body:             |
+  ; CHECK-LABEL: name: recursive_loop
+  ; CHECK-NOT: S_SENDMSG 3
+  ; DEFAULT-NOT: S_ALLOC_VGPR
+  ; DVGPR: S_ALLOC_VGPR 0
+  ; CHECK:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.1
+
+    renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.1, %bb.2
+
+    S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
+    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+    S_BRANCH %bb.2
+
+  bb.2:
+    S_ENDPGM 0
+...
+
+---
+name:            recursive_loop_vmem
+body:             |
+  ; CHECK-LABEL: name: recursive_loop_vmem
+  ; CHECK-NOT: S_SENDMSG 3
+  ; DEFAULT-NOT: S_ALLOC_VGPR
+  ; DVGPR: S_ALLOC_VGPR 0
+  ; CHECK: S_ENDPGM 0
+  bb.0:
+    successors: %bb.1
+
+    renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.1, %bb.2
+
+    TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 115, 0, 0, implicit $exec
+    S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
+    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+    S_BRANCH %bb.2
+
+  bb.2:
+    S_ENDPGM 0
+...
+
+---
+name:            image_store
+body:             |
+  bb.0:
+  ; CHECK-LABEL: name: image_store
+  ; CHECK-NOT: S_SENDMSG 3
+  ; DEFAULT-NOT: S_ALLOC_VGPR
+  ; DVGPR: S_ALLOC_VGPR 0
+  ; CHECK: S_ENDPGM 0
+  IMAGE_STORE_V2_V1_gfx11 killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 12, 0, 1, 0, 0, -1, 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s32>), addrspace 7)
+  S_ENDPGM 0
+...
+
+---
+name:            scratch_store
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: scratch_store
+    ; CHECK-NOT: S_SENDMSG 3
+    ; DEFAULT-NOT: S_ALLOC_VGPR
+    ; DVGPR: S_ALLOC_VGPR 0
+    ; CHECK: S_ENDPGM 0
+    renamable $sgpr0 = S_AND_B32 killed renamable $sgpr0, -16, implicit-def dead $scc
+    SCRATCH_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $sgpr0, 0, 0, implicit $exec, implicit $flat_scr
+    S_ENDPGM 0
+...
+
+---
+name:            buffer_atomic
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: buffer_atomic
+    ; CHECK-NOT: S_SENDMSG 3
+    ; DEFAULT-NOT: S_ALLOC_VGPR
+    ; DVGPR: S_ALLOC_VGPR 0
+    ; CHECK: S_ENDPGM 0
+    BUFFER_ATOMIC_ADD_F32_OFFEN killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 7)
+    S_ENDPGM 0
+...
+
+---
+name:            flat_atomic
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: flat_atomic
+    ; CHECK-NOT: S_SENDMSG 3
+    ; DEFAULT-NOT: S_ALLOC_VGPR
+    ; DVGPR: S_ALLOC_VGPR 0
+    ; CHECK: S_ENDPGM 0
+    renamable $vgpr0_vgpr1 = FLAT_ATOMIC_DEC_X2_RTN killed renamable $vgpr0_vgpr1, killed renamable $vgpr2_vgpr3, 40, 1, implicit $exec, implicit $flat_scr
+    S_ENDPGM 0
+...
+
+
+---
+name:            global_atomic
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: global_atomic
+    ; CHECK-NOT: S_SENDMSG 3
+    ; DEFAULT-NOT: S_ALLOC_VGPR
+    ; DVGPR: S_ALLOC_VGPR 0
+    ; CHECK: S_ENDPGM 0
+    renamable $vgpr0_vgpr1 = GLOBAL_ATOMIC_INC_X2_SADDR_RTN killed renamable $vgpr0, killed renamable $vgpr1_vgpr2, killed renamable $sgpr0_sgpr1, 40, 1, implicit $exec
+    S_ENDPGM 0
+...
+
+---
+name:            image_atomic
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: image_atomic
+    ; CHECK-NOT: S_SENDMSG 3
+    ; DEFAULT-NOT: S_ALLOC_VGPR
+    ; DVGPR: S_ALLOC_VGPR 0
+    ; CHECK: S_ENDPGM 0
+    renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_ATOMIC_CMPSWAP_V4_V1_gfx12 killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, 0, 1, 1, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 7)
+    S_ENDPGM 0
+...
+
+---
+name:            global_store_optnone
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: global_store_optnone
+    ; CHECK-NOT: S_SENDMSG 3
+    ; CHECK-NOT: S_ALLOC_VGPR
+    ; CHECK: S_ENDPGM 0
+    GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec
+    S_WAIT_STORECNT 0
+    S_ENDPGM 0
+...

From 6b7d1740c5c794040999cb36d62be565aebb5b91 Mon Sep 17 00:00:00 2001
From: Diana Picus <diana-magda.picus at amd.com>
Date: Thu, 6 Mar 2025 13:49:46 +0100
Subject: [PATCH 2/2] Remove useless -O2 from test

---
 llvm/test/CodeGen/AMDGPU/release-vgprs-gfx12.mir | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/release-vgprs-gfx12.mir b/llvm/test/CodeGen/AMDGPU/release-vgprs-gfx12.mir
index 884b5f8b6f018..d465bf95fbfbe 100644
--- a/llvm/test/CodeGen/AMDGPU/release-vgprs-gfx12.mir
+++ b/llvm/test/CodeGen/AMDGPU/release-vgprs-gfx12.mir
@@ -1,5 +1,5 @@
-# RUN: llc -O2 -march=amdgcn -mcpu=gfx1200 -run-pass=si-insert-waitcnts -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=CHECK,DEFAULT
-# RUN: llc -O2 -march=amdgcn -mcpu=gfx1200 -mattr=+dynamic-vgpr -run-pass=si-insert-waitcnts -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=CHECK,DVGPR
+# RUN: llc -march=amdgcn -mcpu=gfx1200 -run-pass=si-insert-waitcnts -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=CHECK,DEFAULT
+# RUN: llc -march=amdgcn -mcpu=gfx1200 -mattr=+dynamic-vgpr -run-pass=si-insert-waitcnts -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=CHECK,DVGPR
 
 --- |
   define amdgpu_ps void @tbuffer_store1() { ret void }

