[llvm] eb74917 - [AMDGPU] Reimplement the GFX11 early release VGPRs optimization

Jay Foad via llvm-commits llvm-commits at lists.llvm.org
Mon Jun 19 09:13:12 PDT 2023


Author: Jay Foad
Date: 2023-06-19T17:12:54+01:00
New Revision: eb7491769a511476ffae596a8f58abfd12ed46d4

URL: https://github.com/llvm/llvm-project/commit/eb7491769a511476ffae596a8f58abfd12ed46d4
DIFF: https://github.com/llvm/llvm-project/commit/eb7491769a511476ffae596a8f58abfd12ed46d4.diff

LOG: [AMDGPU] Reimplement the GFX11 early release VGPRs optimization

Implement this optimization in SIInsertWaitcnts, where we already have
information about whether there might be outstanding VMEM store
instructions. This has the following advantages:
- Correctly handles atomics-with-return.
- Correctly handles call instructions.
- Should be faster because it does not require running a separate pass.

Differential Revision: https://reviews.llvm.org/D153279

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPU.h
    llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
    llvm/lib/Target/AMDGPU/CMakeLists.txt
    llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.ll
    llvm/test/CodeGen/AMDGPU/call-argument-types.ll
    llvm/test/CodeGen/AMDGPU/cc-update.ll
    llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll
    llvm/test/CodeGen/AMDGPU/waitcnt-preexisting-vscnt.mir

Removed: 
    llvm/lib/Target/AMDGPU/AMDGPUReleaseVGPRs.cpp
    llvm/test/CodeGen/AMDGPU/release-vgprs.mir


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 07d50df317b77..e4d70abd15f71 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -316,9 +316,6 @@ extern char &SIMemoryLegalizerID;
 void initializeSIModeRegisterPass(PassRegistry&);
 extern char &SIModeRegisterID;
 
-void initializeAMDGPUReleaseVGPRsPass(PassRegistry &);
-extern char &AMDGPUReleaseVGPRsID;
-
 void initializeAMDGPUInsertDelayAluPass(PassRegistry &);
 extern char &AMDGPUInsertDelayAluID;
 

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUReleaseVGPRs.cpp b/llvm/lib/Target/AMDGPU/AMDGPUReleaseVGPRs.cpp
deleted file mode 100644
index b7521540c0205..0000000000000
--- a/llvm/lib/Target/AMDGPU/AMDGPUReleaseVGPRs.cpp
+++ /dev/null
@@ -1,156 +0,0 @@
-//===- AMDGPUReleaseVGPRs.cpp - Automatically release vgprs on GFX11+ -----===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// Insert S_SENDMSG instructions to release vgprs on GFX11+.
-//
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPU.h"
-#include "AMDGPUSubtarget.h"
-#include "GCNSubtarget.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "SIDefines.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/CodeGen/MachineBasicBlock.h"
-#include "llvm/CodeGen/MachineOperand.h"
-#include <optional>
-using namespace llvm;
-
-#define DEBUG_TYPE "release-vgprs"
-
-namespace {
-
-class AMDGPUReleaseVGPRs : public MachineFunctionPass {
-public:
-  static char ID;
-
-  AMDGPUReleaseVGPRs() : MachineFunctionPass(ID) {}
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.setPreservesAll();
-    MachineFunctionPass::getAnalysisUsage(AU);
-  }
-
-  // Track if the last instruction referencing a vgpr in a MBB is a VMEM
-  // store. Because this pass is late in the pipeline, it is expected that the
-  // last vgpr use will likely be one of vmem store, ds, exp.
-  // Loads and others vgpr operations would have been
-  // deleted by this point, except for complex control flow involving loops.
-  // This is why we are just testing the type of instructions rather
-  // than the operands.
-  class LastVGPRUseIsVMEMStore {
-    BitVector BlockVMEMStore;
-
-    static std::optional<bool>
-    lastVGPRUseIsStore(const MachineBasicBlock &MBB) {
-      for (auto &MI : reverse(MBB.instrs())) {
-        // If it's a VMEM store, a VGPR will be used, return true.
-        if ((SIInstrInfo::isVMEM(MI) || SIInstrInfo::isFLAT(MI)) &&
-            MI.mayStore())
-          return true;
-
-        // If it's referencing a VGPR but is not a VMEM store, return false.
-        if (SIInstrInfo::isDS(MI) || SIInstrInfo::isEXP(MI) ||
-            SIInstrInfo::isVMEM(MI) || SIInstrInfo::isFLAT(MI) ||
-            SIInstrInfo::isVALU(MI))
-          return false;
-      }
-      // Wait until the values are propagated from the predecessors
-      return std::nullopt;
-    }
-
-  public:
-    LastVGPRUseIsVMEMStore(const MachineFunction &MF)
-        : BlockVMEMStore(MF.getNumBlockIDs()) {
-
-      df_iterator_default_set<const MachineBasicBlock *> Visited;
-      SmallVector<const MachineBasicBlock *> EndWithVMEMStoreBlocks;
-
-      for (const auto &MBB : MF) {
-        auto LastUseIsStore = lastVGPRUseIsStore(MBB);
-        if (!LastUseIsStore.has_value())
-          continue;
-
-        if (*LastUseIsStore) {
-          EndWithVMEMStoreBlocks.push_back(&MBB);
-        } else {
-          Visited.insert(&MBB);
-        }
-      }
-
-      for (const auto *MBB : EndWithVMEMStoreBlocks) {
-        for (const auto *Succ : depth_first_ext(MBB, Visited)) {
-          BlockVMEMStore[Succ->getNumber()] = true;
-        }
-      }
-    }
-
-    // Return true if the last instruction referencing a vgpr in this MBB
-    // is a VMEM store, otherwise return false.
-    bool isLastVGPRUseVMEMStore(const MachineBasicBlock &MBB) const {
-      return BlockVMEMStore[MBB.getNumber()];
-    }
-  };
-
-  static bool
-  runOnMachineBasicBlock(MachineBasicBlock &MBB, const SIInstrInfo *SII,
-                         const LastVGPRUseIsVMEMStore &BlockVMEMStore) {
-
-    bool Changed = false;
-
-    for (auto &MI : MBB.terminators()) {
-      // Look for S_ENDPGM instructions
-      if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
-          MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) {
-        // If the last instruction using a VGPR in the block is a VMEM store,
-        // release VGPRs. The VGPRs release will be placed just before ending
-        // the program
-        if (BlockVMEMStore.isLastVGPRUseVMEMStore(MBB)) {
-          BuildMI(MBB, MI, DebugLoc(), SII->get(AMDGPU::S_SENDMSG))
-              .addImm(AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus);
-          Changed = true;
-        }
-      }
-    }
-
-    return Changed;
-  }
-
-  bool runOnMachineFunction(MachineFunction &MF) override {
-    Function &F = MF.getFunction();
-    if (skipFunction(F) || !AMDGPU::isEntryFunctionCC(F.getCallingConv()))
-      return false;
-
-    // This pass only runs on GFX11+
-    const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
-    if (ST.getGeneration() < AMDGPUSubtarget::GFX11)
-      return false;
-
-    LLVM_DEBUG(dbgs() << "AMDGPUReleaseVGPRs running on " << MF.getName()
-                      << "\n");
-
-    const SIInstrInfo *SII = ST.getInstrInfo();
-    LastVGPRUseIsVMEMStore BlockVMEMStore(MF);
-
-    bool Changed = false;
-    for (auto &MBB : MF) {
-      Changed |= runOnMachineBasicBlock(MBB, SII, BlockVMEMStore);
-    }
-
-    return Changed;
-  }
-};
-
-} // namespace
-
-char AMDGPUReleaseVGPRs::ID = 0;
-
-char &llvm::AMDGPUReleaseVGPRsID = AMDGPUReleaseVGPRs::ID;
-
-INITIALIZE_PASS(AMDGPUReleaseVGPRs, DEBUG_TYPE, "Release VGPRs", false, false)

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index c3f7d753e1979..704f2e0b19bb9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -405,7 +405,6 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeAMDGPURewriteUndefForPHIPass(*PR);
   initializeAMDGPUUnifyMetadataPass(*PR);
   initializeSIAnnotateControlFlowPass(*PR);
-  initializeAMDGPUReleaseVGPRsPass(*PR);
   initializeAMDGPUInsertDelayAluPass(*PR);
   initializeSIInsertHardClausesPass(*PR);
   initializeSIInsertWaitcntsPass(*PR);
@@ -1431,9 +1430,6 @@ void GCNPassConfig::addPreEmitPass() {
   // cases.
   addPass(&PostRAHazardRecognizerID);
 
-  if (getOptLevel() > CodeGenOpt::Less)
-    addPass(&AMDGPUReleaseVGPRsID);
-
   if (isPassEnabled(EnableInsertDelayAlu, CodeGenOpt::Less))
     addPass(&AMDGPUInsertDelayAluID);
 

diff  --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 8df156f24dcb6..ae88d934ad52b 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -88,7 +88,6 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPURegBankCombiner.cpp
   AMDGPURegBankSelect.cpp
   AMDGPURegisterBankInfo.cpp
-  AMDGPUReleaseVGPRs.cpp
   AMDGPURemoveIncompatibleFunctions.cpp
   AMDGPUResourceUsageAnalysis.cpp
   AMDGPURewriteOutArguments.cpp

diff  --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 6f0be07a3e23e..05247d3ff1603 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -386,6 +386,10 @@ class SIInsertWaitcnts : public MachineFunctionPass {
   bool ForceEmitZeroWaitcnts;
   bool ForceEmitWaitcnt[NUM_INST_CNTS];
 
+  // S_ENDPGM instructions before which we should insert a DEALLOC_VGPRS
+  // message.
+  DenseSet<MachineInstr *> ReleaseVGPRInsts;
+
 public:
   static char ID;
 
@@ -1032,6 +1036,15 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
       (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
     Wait = Wait.combined(allZeroWaitcnt());
   }
+  // Identify S_ENDPGM instructions which may have to wait for outstanding VMEM
+  // stores. In this case it can be useful to send a message to explicitly
+  // release all VGPRs before the stores have completed.
+  else if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
+           MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) {
+    if (ST->getGeneration() >= AMDGPUSubtarget::GFX11 &&
+        ScoreBrackets.getScoreRange(VS_CNT) != 0)
+      ReleaseVGPRInsts.insert(&MI);
+  }
   // Resolve vm waits before gs-done.
   else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
             MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
@@ -1930,5 +1943,14 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
     }
   }
 
+  // Insert DEALLOC_VGPR messages before previously identified S_ENDPGM
+  // instructions.
+  for (MachineInstr *MI : ReleaseVGPRInsts) {
+    BuildMI(*MI->getParent(), MI, DebugLoc(), TII->get(AMDGPU::S_SENDMSG))
+        .addImm(AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus);
+    Modified = true;
+  }
+  ReleaseVGPRInsts.clear();
+
   return Modified;
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll
index f4a3234c73ee6..2f79135fe610d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll
@@ -159,7 +159,6 @@ define amdgpu_kernel void @global_atomic_csub_sgpr_base_offset_nortn(ptr addrspa
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0x1000 :: v_dual_mov_b32 v0, s2
 ; GFX11-NEXT:    global_atomic_csub_u32 v0, v1, v0, s[0:1] glc
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024
   %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %gep, i32 %data)

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.ll
index d95f8eb5d7a45..00f6b7ac9342e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.ll
@@ -3,8 +3,8 @@
 ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -o - %s | FileCheck -check-prefix=GFX8 %s
 ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX900 %s
 ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx90a -o - %s | FileCheck -check-prefix=GFX90A %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -o - %s | FileCheck -check-prefix=GFX10PLUS %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -o - %s | FileCheck -check-prefix=GFX10PLUS %s
 
 define amdgpu_ps float @atomic_swap_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) {
 ; GFX6-LABEL: atomic_swap_i32_1d:
@@ -1059,32 +1059,18 @@ define amdgpu_ps void @atomic_cmpswap_i32_1d_no_return(<8 x i32> inreg %rsrc, i3
 ; GFX90A-NEXT:    image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm glc
 ; GFX90A-NEXT:    s_endpgm
 ;
-; GFX10-LABEL: atomic_cmpswap_i32_1d_no_return:
-; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    s_mov_b32 s0, s2
-; GFX10-NEXT:    s_mov_b32 s1, s3
-; GFX10-NEXT:    s_mov_b32 s2, s4
-; GFX10-NEXT:    s_mov_b32 s3, s5
-; GFX10-NEXT:    s_mov_b32 s4, s6
-; GFX10-NEXT:    s_mov_b32 s5, s7
-; GFX10-NEXT:    s_mov_b32 s6, s8
-; GFX10-NEXT:    s_mov_b32 s7, s9
-; GFX10-NEXT:    image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm glc
-; GFX10-NEXT:    s_endpgm
-;
-; GFX11-LABEL: atomic_cmpswap_i32_1d_no_return:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    s_mov_b32 s0, s2
-; GFX11-NEXT:    s_mov_b32 s1, s3
-; GFX11-NEXT:    s_mov_b32 s2, s4
-; GFX11-NEXT:    s_mov_b32 s3, s5
-; GFX11-NEXT:    s_mov_b32 s4, s6
-; GFX11-NEXT:    s_mov_b32 s5, s7
-; GFX11-NEXT:    s_mov_b32 s6, s8
-; GFX11-NEXT:    s_mov_b32 s7, s9
-; GFX11-NEXT:    image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm glc
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT:    s_endpgm
+; GFX10PLUS-LABEL: atomic_cmpswap_i32_1d_no_return:
+; GFX10PLUS:       ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT:    s_mov_b32 s0, s2
+; GFX10PLUS-NEXT:    s_mov_b32 s1, s3
+; GFX10PLUS-NEXT:    s_mov_b32 s2, s4
+; GFX10PLUS-NEXT:    s_mov_b32 s3, s5
+; GFX10PLUS-NEXT:    s_mov_b32 s4, s6
+; GFX10PLUS-NEXT:    s_mov_b32 s5, s7
+; GFX10PLUS-NEXT:    s_mov_b32 s6, s8
+; GFX10PLUS-NEXT:    s_mov_b32 s7, s9
+; GFX10PLUS-NEXT:    image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm glc
+; GFX10PLUS-NEXT:    s_endpgm
 main_body:
   %v = call i32 @llvm.amdgcn.image.atomic.cmpswap.1d.i32.i32(i32 %cmp, i32 %swap, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
   ret void
@@ -2760,32 +2746,18 @@ define amdgpu_ps void @atomic_cmpswap_i64_1d_no_return(<8 x i32> inreg %rsrc, i6
 ; GFX90A-NEXT:    image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf unorm glc
 ; GFX90A-NEXT:    s_endpgm
 ;
-; GFX10-LABEL: atomic_cmpswap_i64_1d_no_return:
-; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    s_mov_b32 s0, s2
-; GFX10-NEXT:    s_mov_b32 s1, s3
-; GFX10-NEXT:    s_mov_b32 s2, s4
-; GFX10-NEXT:    s_mov_b32 s3, s5
-; GFX10-NEXT:    s_mov_b32 s4, s6
-; GFX10-NEXT:    s_mov_b32 s5, s7
-; GFX10-NEXT:    s_mov_b32 s6, s8
-; GFX10-NEXT:    s_mov_b32 s7, s9
-; GFX10-NEXT:    image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc
-; GFX10-NEXT:    s_endpgm
-;
-; GFX11-LABEL: atomic_cmpswap_i64_1d_no_return:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    s_mov_b32 s0, s2
-; GFX11-NEXT:    s_mov_b32 s1, s3
-; GFX11-NEXT:    s_mov_b32 s2, s4
-; GFX11-NEXT:    s_mov_b32 s3, s5
-; GFX11-NEXT:    s_mov_b32 s4, s6
-; GFX11-NEXT:    s_mov_b32 s5, s7
-; GFX11-NEXT:    s_mov_b32 s6, s8
-; GFX11-NEXT:    s_mov_b32 s7, s9
-; GFX11-NEXT:    image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT:    s_endpgm
+; GFX10PLUS-LABEL: atomic_cmpswap_i64_1d_no_return:
+; GFX10PLUS:       ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT:    s_mov_b32 s0, s2
+; GFX10PLUS-NEXT:    s_mov_b32 s1, s3
+; GFX10PLUS-NEXT:    s_mov_b32 s2, s4
+; GFX10PLUS-NEXT:    s_mov_b32 s3, s5
+; GFX10PLUS-NEXT:    s_mov_b32 s4, s6
+; GFX10PLUS-NEXT:    s_mov_b32 s5, s7
+; GFX10PLUS-NEXT:    s_mov_b32 s6, s8
+; GFX10PLUS-NEXT:    s_mov_b32 s7, s9
+; GFX10PLUS-NEXT:    image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc
+; GFX10PLUS-NEXT:    s_endpgm
 main_body:
   %v = call i64 @llvm.amdgcn.image.atomic.cmpswap.1d.i64.i32(i64 %cmp, i64 %swap, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
   ret void

diff  --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
index ded7081033f0e..acf69e7e7b2ad 100644
--- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
@@ -4382,7 +4382,6 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
 ; GFX11-NEXT:    s_waitcnt vmcnt(7)
 ; GFX11-NEXT:    scratch_store_b32 off, v31, s32
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; HSA-LABEL: test_call_external_void_func_v32i32:
@@ -4552,7 +4551,6 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 {
 ; GFX11-NEXT:    s_waitcnt vmcnt(7)
 ; GFX11-NEXT:    scratch_store_b32 off, v32, s4
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; HSA-LABEL: test_call_external_void_func_v32i32_i32:
@@ -4926,7 +4924,6 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    scratch_store_b64 off, v[0:1], s32
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; HSA-LABEL: test_call_external_void_func_byval_struct_i8_i32:

diff  --git a/llvm/test/CodeGen/AMDGPU/cc-update.ll b/llvm/test/CodeGen/AMDGPU/cc-update.ll
index a3e22a3bc7db6..fdb8bec944cb3 100644
--- a/llvm/test/CodeGen/AMDGPU/cc-update.ll
+++ b/llvm/test/CodeGen/AMDGPU/cc-update.ll
@@ -238,7 +238,6 @@ define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 {
 ; GFX1100-NEXT:    s_add_u32 s16, s16, ex at rel32@lo+4
 ; GFX1100-NEXT:    s_addc_u32 s17, s17, ex at rel32@hi+12
 ; GFX1100-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1100-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-NEXT:    s_endpgm
 
 entry:
@@ -517,7 +516,6 @@ define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_add
 ; GFX1100-NEXT:    s_add_u32 s16, s16, ex at rel32@lo+4
 ; GFX1100-NEXT:    s_addc_u32 s17, s17, ex at rel32@hi+12
 ; GFX1100-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1100-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-NEXT:    s_endpgm
 entry:
   %x = alloca i32, align 4, addrspace(5)

diff  --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index fe4d0224b038b..4d9ed0a784bfe 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -1026,7 +1026,6 @@
 ; GCN-O2-NEXT:        SI Final Branch Preparation
 ; GCN-O2-NEXT:        SI peephole optimizations
 ; GCN-O2-NEXT:        Post RA hazard recognizer
-; GCN-O2-NEXT:        Release VGPRs
 ; GCN-O2-NEXT:        AMDGPU Insert Delay ALU
 ; GCN-O2-NEXT:        Branch relaxation pass
 ; GCN-O2-NEXT:        Register Usage Information Collector Pass
@@ -1350,7 +1349,6 @@
 ; GCN-O3-NEXT:        SI Final Branch Preparation
 ; GCN-O3-NEXT:        SI peephole optimizations
 ; GCN-O3-NEXT:        Post RA hazard recognizer
-; GCN-O3-NEXT:        Release VGPRs
 ; GCN-O3-NEXT:        AMDGPU Insert Delay ALU
 ; GCN-O3-NEXT:        Branch relaxation pass
 ; GCN-O3-NEXT:        Register Usage Information Collector Pass

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll
index cbf0ca740c76b..4b16e9b2ce846 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll
@@ -591,6 +591,7 @@ define amdgpu_ps void @s_buffer_load_index_across_bb(<4 x i32> inreg %desc, i32
 ; GFX11-NEXT:    buffer_load_b32 v0, v0, s[0:3], 0 offen
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    exp mrt0 v0, v0, v0, v0 done
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 main_body:
   %tmp = shl i32 %index, 4

diff  --git a/llvm/test/CodeGen/AMDGPU/release-vgprs.mir b/llvm/test/CodeGen/AMDGPU/release-vgprs.mir
deleted file mode 100644
index 7adb2e4cac416..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/release-vgprs.mir
+++ /dev/null
@@ -1,411 +0,0 @@
-# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-
-# RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass=release-vgprs -verify-machineinstrs -o - %s | FileCheck %s
-
---- |
-  define amdgpu_ps void @tbuffer_store1() { ret void }
-  define amdgpu_ps void @tbuffer_store2() { ret void }
-  define amdgpu_ps void @flat_store() { ret void }
-  define amdgpu_ps void @global_store() { ret void }
-  define amdgpu_ps void @buffer_store_format() { ret void }
-  define amdgpu_ps void @ds_write_b32() { ret void }
-  define amdgpu_ps void @global_store_dword() { ret void }
-  define amdgpu_ps void @multiple_basic_blocks1() { ret void }
-  define amdgpu_ps void @multiple_basic_blocks2() { ret void }
-  define amdgpu_ps void @multiple_basic_blocks3() { ret void }
-  define amdgpu_ps void @recursive_loop() { ret void }
-  define amdgpu_ps void @recursive_loop_vmem() { ret void }
-  define amdgpu_ps void @image_store() { ret void }
-  define amdgpu_ps void @scratch_store() { ret void }
-  define amdgpu_ps void @buffer_atomic() { ret void }
-  define amdgpu_ps void @flat_atomic() { ret void }
-  define amdgpu_ps void @global_atomic() { ret void }
-  define amdgpu_ps void @image_atomic() { ret void }
-...
-
----
-name:            tbuffer_store1
-body:             |
-  bb.0:
-    ; CHECK-LABEL: name: tbuffer_store1
-    ; CHECK: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 42, 117, 0, 0, implicit $exec
-    ; CHECK-NEXT: S_SENDMSG 3, implicit $exec, implicit $m0
-    ; CHECK-NEXT: S_ENDPGM 0
-    TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 42, 117, 0, 0, implicit $exec
-    S_ENDPGM 0
-...
-
----
-name:            tbuffer_store2
-body:             |
-  bb.0:
-    ; CHECK-LABEL: name: tbuffer_store2
-    ; CHECK: TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 115, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7)
-    ; CHECK-NEXT: S_SENDMSG 3, implicit $exec, implicit $m0
-    ; CHECK-NEXT: S_ENDPGM 0
-    TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 115, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7)
-    S_ENDPGM 0
-...
-
----
-name:            flat_store
-body:             |
-  bb.0:
-    ; CHECK-LABEL: name: flat_store
-    ; CHECK: FLAT_STORE_DWORDX4 $vgpr49_vgpr50, $vgpr26_vgpr27_vgpr28_vgpr29, 0, 0, implicit $exec, implicit $flat_scr
-    ; CHECK-NEXT: S_SENDMSG 3, implicit $exec, implicit $m0
-    ; CHECK-NEXT: S_ENDPGM 0
-    FLAT_STORE_DWORDX4 $vgpr49_vgpr50, $vgpr26_vgpr27_vgpr28_vgpr29, 0, 0, implicit $exec, implicit $flat_scr
-    S_ENDPGM 0
-...
-
----
-name:            global_store
-body:             |
-  bb.0:
-    ; CHECK-LABEL: name: global_store
-    ; CHECK: GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec
-    ; CHECK-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 0
-    ; CHECK-NEXT: S_SENDMSG 3, implicit $exec, implicit $m0
-    ; CHECK-NEXT: S_ENDPGM 0
-    GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec
-    S_WAITCNT_VSCNT undef $sgpr_null, 0
-    S_ENDPGM 0
-...
-
----
-name:            buffer_store_format
-body:             |
-  bb.0:
-    ; CHECK-LABEL: name: buffer_store_format
-    ; CHECK: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 0, 0, 0, implicit $exec
-    ; CHECK-NEXT: S_SENDMSG 3, implicit $exec, implicit $m0
-    ; CHECK-NEXT: S_ENDPGM 0
-    BUFFER_STORE_FORMAT_D16_X_OFFEN_exact killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 0, 0, 0, implicit $exec
-    S_ENDPGM 0
-...
-
----
-name:            ds_write_b32
-body:             |
-  bb.0:
-    ; CHECK-LABEL: name: ds_write_b32
-    ; CHECK: renamable $vgpr0 = IMPLICIT_DEF
-    ; CHECK-NEXT: renamable $vgpr1 = IMPLICIT_DEF
-    ; CHECK-NEXT: DS_WRITE_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, 12, 0, implicit $exec
-    ; CHECK-NEXT: S_ENDPGM 0
-    renamable $vgpr0 = IMPLICIT_DEF
-    renamable $vgpr1 = IMPLICIT_DEF
-    DS_WRITE_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, 12, 0, implicit $exec
-    S_ENDPGM 0
-
-...
----
-name:            global_store_dword
-body:             |
-  bb.0:
-    liveins: $vgpr0, $sgpr0_sgpr1
-
-    ; CHECK-LABEL: name: global_store_dword
-    ; CHECK: liveins: $vgpr0, $sgpr0_sgpr1
-    ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: renamable $vgpr0 = V_MAD_I32_I24_e64 killed $vgpr1, killed $vgpr0, killed $sgpr2, 0, implicit $exec
-    ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr2, killed renamable $vgpr0, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec
-    ; CHECK-NEXT: S_SENDMSG 3, implicit $exec, implicit $m0
-    ; CHECK-NEXT: S_ENDPGM 0
-    renamable $vgpr0 = V_MAD_I32_I24_e64 killed $vgpr1, killed $vgpr0, killed $sgpr2, 0, implicit $exec
-    GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr2, killed renamable $vgpr0, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec
-    S_ENDPGM 0
-...
-
----
-name:            multiple_basic_blocks1
-body:             |
-  ; CHECK-LABEL: name: multiple_basic_blocks1
-  ; CHECK: bb.0:
-  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
-  ; CHECK-NEXT:   S_BRANCH %bb.1
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT: bb.1:
-  ; CHECK-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
-  ; CHECK-NEXT:   S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
-  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.1, implicit killed $scc
-  ; CHECK-NEXT:   S_BRANCH %bb.2
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT: bb.2:
-  ; CHECK-NEXT:   S_ENDPGM 0
-  bb.0:
-    successors: %bb.1
-
-    renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
-    S_BRANCH %bb.1
-
-  bb.1:
-    successors: %bb.1, %bb.2
-
-    $vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
-    S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
-    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
-    S_BRANCH %bb.2
-
-  bb.2:
-    S_ENDPGM 0
-
-...
-
-
-# One block has a VMEM store as the last instruction, we should release the VGPRS
-...
----
-name:            multiple_basic_blocks2
-body:             |
-  ; CHECK-LABEL: name: multiple_basic_blocks2
-  ; CHECK: bb.0:
-  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   TBUFFER_STORE_FORMAT_X_OFFSET_exact killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 125, 0, 0, implicit $exec
-  ; CHECK-NEXT:   $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
-  ; CHECK-NEXT:   S_BRANCH %bb.2
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT: bb.1:
-  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
-  ; CHECK-NEXT:   TBUFFER_STORE_FORMAT_X_OFFSET_exact killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 125, 0, 0, implicit $exec
-  ; CHECK-NEXT:   S_BRANCH %bb.2
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT: bb.2:
-  ; CHECK-NEXT:   S_SENDMSG 3, implicit $exec, implicit $m0
-  ; CHECK-NEXT:   S_ENDPGM 0
-  bb.0:
-    successors: %bb.2
-
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 125, 0, 0, implicit $exec
-    $vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
-    S_BRANCH %bb.2
-
-  bb.1:
-    successors: %bb.2
-
-    $vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 125, 0, 0, implicit $exec
-    S_BRANCH %bb.2
-
-  bb.2:
-    S_ENDPGM 0
-...
-
-
-# One parent block has a VMEM store, release VGPRs
----
-name:            multiple_basic_blocks3
-body:             |
-  ; CHECK-LABEL: name: multiple_basic_blocks3
-  ; CHECK: bb.0:
-  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
-  ; CHECK-NEXT:   TBUFFER_STORE_FORMAT_X_OFFSET_exact killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 125, 0, 0, implicit $exec
-  ; CHECK-NEXT:   S_BRANCH %bb.2
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT: bb.1:
-  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
-  ; CHECK-NEXT:   S_BRANCH %bb.2
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT: bb.2:
-  ; CHECK-NEXT:   successors: %bb.4(0x80000000)
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   S_BRANCH %bb.4
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT: bb.3:
-  ; CHECK-NEXT:   successors: %bb.4(0x80000000)
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
-  ; CHECK-NEXT:   S_BRANCH %bb.4
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT: bb.4:
-  ; CHECK-NEXT:   S_SENDMSG 3, implicit $exec, implicit $m0
-  ; CHECK-NEXT:   S_ENDPGM 0
-  bb.0:
-    successors: %bb.2
-
-    $vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 125, 0, 0, implicit $exec
-    S_BRANCH %bb.2
-
-  bb.1:
-    successors: %bb.2
-
-    $vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
-    S_BRANCH %bb.2
-
-  bb.2:
-    successors: %bb.4
-
-    S_BRANCH %bb.4
-
-  bb.3:
-    successors: %bb.4
-
-    $vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
-    S_BRANCH %bb.4
-
-  bb.4:
-    S_ENDPGM 0
-...
-
----
-name:            recursive_loop
-body:             |
-  ; CHECK-LABEL: name: recursive_loop
-  ; CHECK: bb.0:
-  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
-  ; CHECK-NEXT:   S_BRANCH %bb.1
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT: bb.1:
-  ; CHECK-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
-  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.1, implicit killed $scc
-  ; CHECK-NEXT:   S_BRANCH %bb.2
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT: bb.2:
-  ; CHECK-NEXT:   S_ENDPGM 0
-  bb.0:
-    successors: %bb.1
-
-    renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
-    S_BRANCH %bb.1
-
-  bb.1:
-    successors: %bb.1, %bb.2
-
-    S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
-    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
-    S_BRANCH %bb.2
-
-  bb.2:
-    S_ENDPGM 0
-...
-
----
-name:            recursive_loop_vmem
-body:             |
-  ; CHECK-LABEL: name: recursive_loop_vmem
-  ; CHECK: bb.0:
-  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
-  ; CHECK-NEXT:   S_BRANCH %bb.1
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT: bb.1:
-  ; CHECK-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 115, 0, 0, implicit $exec
-  ; CHECK-NEXT:   S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
-  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.1, implicit killed $scc
-  ; CHECK-NEXT:   S_BRANCH %bb.2
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT: bb.2:
-  ; CHECK-NEXT:   S_SENDMSG 3, implicit $exec, implicit $m0
-  ; CHECK-NEXT:   S_ENDPGM 0
-  bb.0:
-    successors: %bb.1
-
-    renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
-    S_BRANCH %bb.1
-
-  bb.1:
-    successors: %bb.1, %bb.2
-
-    TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 115, 0, 0, implicit $exec
-    S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
-    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
-    S_BRANCH %bb.2
-
-  bb.2:
-    S_ENDPGM 0
-...
-
----
-name:            image_store
-body:             |
-  bb.0:
-    ; CHECK-LABEL: name: image_store
-    ; CHECK: IMAGE_STORE_V2_V1_gfx11 killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 12, 0, 1, 0, 0, -1, 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s32>), addrspace 7)
-    ; CHECK-NEXT: S_SENDMSG 3, implicit $exec, implicit $m0
-    ; CHECK-NEXT: S_ENDPGM 0
-  IMAGE_STORE_V2_V1_gfx11 killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 12, 0, 1, 0, 0, -1, 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s32>), addrspace 7)
-  S_ENDPGM 0
-...
-
----
-name:            scratch_store
-body:             |
-  bb.0:
-    ; CHECK-LABEL: name: scratch_store
-    ; CHECK: renamable $sgpr0 = S_AND_B32 killed renamable $sgpr0, -16, implicit-def dead $scc
-    ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $sgpr0, 0, 0, implicit $exec, implicit $flat_scr
-    ; CHECK-NEXT: S_SENDMSG 3, implicit $exec, implicit $m0
-    ; CHECK-NEXT: S_ENDPGM 0
-    renamable $sgpr0 = S_AND_B32 killed renamable $sgpr0, -16, implicit-def dead $scc
-    SCRATCH_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $sgpr0, 0, 0, implicit $exec, implicit $flat_scr
-    S_ENDPGM 0
-...
-
----
-name:            buffer_atomic
-body:             |
-  bb.0:
-    ; CHECK-LABEL: name: buffer_atomic
-    ; CHECK: BUFFER_ATOMIC_ADD_F32_OFFEN killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 7)
-    ; CHECK-NEXT: S_SENDMSG 3, implicit $exec, implicit $m0
-    ; CHECK-NEXT: S_ENDPGM 0
-    BUFFER_ATOMIC_ADD_F32_OFFEN killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 7)
-    S_ENDPGM 0
-...
-
----
-name:            flat_atomic
-body:             |
-  bb.0:
-    ; CHECK-LABEL: name: flat_atomic
-    ; CHECK: renamable $vgpr0_vgpr1 = FLAT_ATOMIC_DEC_X2_RTN killed renamable $vgpr0_vgpr1, killed renamable $vgpr2_vgpr3, 40, 1, implicit $exec, implicit $flat_scr
-    ; CHECK-NEXT: S_SENDMSG 3, implicit $exec, implicit $m0
-    ; CHECK-NEXT: S_ENDPGM 0
-    renamable $vgpr0_vgpr1 = FLAT_ATOMIC_DEC_X2_RTN killed renamable $vgpr0_vgpr1, killed renamable $vgpr2_vgpr3, 40, 1, implicit $exec, implicit $flat_scr
-    S_ENDPGM 0
-...
-
-
----
-name:            global_atomic
-body:             |
-  bb.0:
-    ; CHECK-LABEL: name: global_atomic
-    ; CHECK: renamable $vgpr0_vgpr1 = GLOBAL_ATOMIC_INC_X2_SADDR_RTN killed renamable $vgpr0, killed renamable $vgpr1_vgpr2, killed renamable $sgpr0_sgpr1, 40, 1, implicit $exec
-    ; CHECK-NEXT: S_SENDMSG 3, implicit $exec, implicit $m0
-    ; CHECK-NEXT: S_ENDPGM 0
-    renamable $vgpr0_vgpr1 = GLOBAL_ATOMIC_INC_X2_SADDR_RTN killed renamable $vgpr0, killed renamable $vgpr1_vgpr2, killed renamable $sgpr0_sgpr1, 40, 1, implicit $exec
-    S_ENDPGM 0
-...
-
----
-name:            image_atomic
-body:             |
-  bb.0:
-    ; CHECK-LABEL: name: image_atomic
-    ; CHECK: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx11 killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, 0, 1, 1, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 7)
-    ; CHECK-NEXT: S_SENDMSG 3, implicit $exec, implicit $m0
-    ; CHECK-NEXT: S_ENDPGM 0
-    renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx11 killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, 0, 1, 1, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 7)
-    S_ENDPGM 0
-...

diff  --git a/llvm/test/CodeGen/AMDGPU/waitcnt-preexisting-vscnt.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-preexisting-vscnt.mir
index 5f04e8cff880e..97dc6b60c0b4a 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-preexisting-vscnt.mir
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-preexisting-vscnt.mir
@@ -31,6 +31,7 @@ body:             |
     ; GFX11-NEXT: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
     ; GFX11-NEXT: S_WAITCNT 7
     ; GFX11-NEXT: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GFX11-NEXT: S_SENDMSG 3, implicit $exec, implicit $m0
     ; GFX11-NEXT: S_ENDPGM 0
     GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
     S_WAITCNT_VSCNT undef $sgpr_null, 0
@@ -69,6 +70,7 @@ body:             |
     ; GFX11-NEXT: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
     ; GFX11-NEXT: S_WAITCNT 7
     ; GFX11-NEXT: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GFX11-NEXT: S_SENDMSG 3, implicit $exec, implicit $m0
     ; GFX11-NEXT: S_ENDPGM 0
     GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
     S_WAITCNT_VSCNT undef $sgpr_null, 1
@@ -109,6 +111,7 @@ body:             |
     ; GFX11-NEXT: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
     ; GFX11-NEXT: S_WAITCNT 7
     ; GFX11-NEXT: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GFX11-NEXT: S_SENDMSG 3, implicit $exec, implicit $m0
     ; GFX11-NEXT: S_ENDPGM 0
     GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
     S_WAITCNT 112
@@ -148,6 +151,7 @@ body:             |
     ; GFX11-NEXT: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
     ; GFX11-NEXT: S_WAITCNT 7
     ; GFX11-NEXT: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GFX11-NEXT: S_SENDMSG 3, implicit $exec, implicit $m0
     ; GFX11-NEXT: S_ENDPGM 0
     GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
     S_WAITCNT_VSCNT undef $sgpr_null, 0
@@ -190,6 +194,7 @@ body:             |
     ; GFX11-NEXT: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
     ; GFX11-NEXT: S_WAITCNT 7
     ; GFX11-NEXT: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GFX11-NEXT: S_SENDMSG 3, implicit $exec, implicit $m0
     ; GFX11-NEXT: S_ENDPGM 0
     GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
     S_WAITCNT 0


        


More information about the llvm-commits mailing list