[llvm] [AMDGPU] Skip VGPR deallocation for waveslot limited kernels (PR #112765)

Stanislav Mekhanoshin via llvm-commits llvm-commits at lists.llvm.org
Thu Oct 17 14:50:12 PDT 2024


https://github.com/rampitec updated https://github.com/llvm/llvm-project/pull/112765

>From ae13d8a4e1ef6394902b27380ef19a2ffc33e3ea Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
Date: Thu, 17 Oct 2024 11:26:33 -0700
Subject: [PATCH 1/2] [AMDGPU] Skip VGPR deallocation for waveslot limited
 kernels

MSG_DEALLOC_VGPRS slows down very small waveslot limited kernels.
It's been identified this message is only really needed for VGPR
limited kernels. A kernel becomes VGPR limited if a total number
of VGPRs per SIMD / number of used VGPRs is more than a number of
wave slots.
---
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp   |  25 +-
 .../CodeGen/AMDGPU/GlobalISel/addsubu64.ll    |  12 -
 .../AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll  |  28 -
 .../AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll  |  30 -
 .../AMDGPU/GlobalISel/extractelement.ll       |  16 -
 .../CodeGen/AMDGPU/GlobalISel/flat-scratch.ll |  24 -
 .../AMDGPU/GlobalISel/insertelement.i16.ll    |  62 --
 .../AMDGPU/GlobalISel/insertelement.i8.ll     |  62 --
 .../AMDGPU/GlobalISel/insertelement.large.ll  |   2 -
 .../AMDGPU/GlobalISel/insertelement.ll        |  18 -
 .../GlobalISel/llvm.amdgcn.ballot.i32.ll      |   2 -
 .../GlobalISel/llvm.amdgcn.div.scale.ll       |  48 --
 .../GlobalISel/llvm.amdgcn.end.cf.i32.ll      |   2 -
 .../llvm.amdgcn.global.atomic.csub.ll         |   4 -
 .../GlobalISel/llvm.amdgcn.if.break.i32.ll    |   2 -
 .../GlobalISel/llvm.amdgcn.image.store.2d.ll  |  44 -
 .../AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll  |   4 -
 .../GlobalISel/llvm.amdgcn.update.dpp.ll      |  12 -
 .../AMDGPU/GlobalISel/llvm.amdgcn.wmma_32.ll  |  52 --
 .../AMDGPU/GlobalISel/llvm.amdgcn.wmma_64.ll  |  52 --
 .../AMDGPU/GlobalISel/load-unaligned.ll       |   2 -
 .../CodeGen/AMDGPU/GlobalISel/mubuf-global.ll |  32 -
 .../AMDGPU/GlobalISel/mul-known-bits.i64.ll   |  22 -
 llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll    |  16 -
 .../AMDGPU/GlobalISel/shl-ext-reduce.ll       |   4 -
 ...wmma-gfx12-w32-f16-f32-matrix-modifiers.ll |  56 --
 .../AMDGPU/GlobalISel/wmma-gfx12-w32-imm.ll   |  44 -
 .../GlobalISel/wmma-gfx12-w32-iu-modifiers.ll |  36 -
 .../wmma-gfx12-w32-swmmac-index_key.ll        |  20 -
 .../AMDGPU/GlobalISel/wmma-gfx12-w32.ll       |  44 -
 ...wmma-gfx12-w64-f16-f32-matrix-modifiers.ll |  56 --
 .../AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll   |  44 -
 .../GlobalISel/wmma-gfx12-w64-iu-modifiers.ll |  36 -
 .../wmma-gfx12-w64-swmmac-index_key.ll        |  22 -
 .../AMDGPU/GlobalISel/wmma-gfx12-w64.ll       |  44 -
 llvm/test/CodeGen/AMDGPU/add.ll               |  40 -
 llvm/test/CodeGen/AMDGPU/add.v2i16.ll         |  26 -
 .../AMDGPU/atomic_optimizations_buffer.ll     |  72 --
 .../atomic_optimizations_global_pointer.ll    | 128 ---
 .../atomic_optimizations_local_pointer.ll     | 192 -----
 .../atomic_optimizations_pixelshader.ll       |   8 -
 .../AMDGPU/atomic_optimizations_raw_buffer.ll |  64 --
 .../atomic_optimizations_struct_buffer.ll     |  80 --
 llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll  |   8 -
 llvm/test/CodeGen/AMDGPU/bitreverse.ll        |  40 -
 llvm/test/CodeGen/AMDGPU/br_cc.f16.ll         |   8 -
 llvm/test/CodeGen/AMDGPU/branch-relaxation.ll |  78 +-
 llvm/test/CodeGen/AMDGPU/bswap.ll             |  14 -
 llvm/test/CodeGen/AMDGPU/build_vector.ll      |  10 -
 .../CodeGen/AMDGPU/calling-conventions.ll     |  62 --
 .../test/CodeGen/AMDGPU/carryout-selection.ll |  34 -
 llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll    |   2 -
 llvm/test/CodeGen/AMDGPU/clamp-modifier.ll    |  40 -
 llvm/test/CodeGen/AMDGPU/clamp.ll             | 196 -----
 .../AMDGPU/commute-compares-scalar-float.ll   |  64 --
 llvm/test/CodeGen/AMDGPU/ctlz.ll              |  32 -
 llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll     |  38 -
 .../AMDGPU/divergence-driven-buildvector.ll   |  12 -
 llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll     |   6 -
 .../expand-scalar-carry-out-select-user.ll    |   2 -
 .../CodeGen/AMDGPU/extract_vector_elt-f16.ll  |  22 -
 llvm/test/CodeGen/AMDGPU/fabs.f16.ll          |  22 -
 llvm/test/CodeGen/AMDGPU/fadd.f16.ll          |  48 --
 .../fast-unaligned-load-store.global.ll       |  12 -
 llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll |  94 ---
 llvm/test/CodeGen/AMDGPU/fcanonicalize.ll     | 220 -----
 llvm/test/CodeGen/AMDGPU/fcmp.f16.ll          |  58 --
 llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll     |  44 -
 llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll     |  38 -
 llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll     |  32 -
 llvm/test/CodeGen/AMDGPU/fdiv.f16.ll          |  34 -
 llvm/test/CodeGen/AMDGPU/fdiv.ll              |  36 -
 .../AMDGPU/fix-sgpr-copies-nondeterminism.ll  |   2 -
 llvm/test/CodeGen/AMDGPU/flat-scratch.ll      |  24 -
 llvm/test/CodeGen/AMDGPU/fma-combine.ll       |  90 ---
 llvm/test/CodeGen/AMDGPU/fmax3.ll             |   8 -
 llvm/test/CodeGen/AMDGPU/fmaximum.ll          |   4 -
 llvm/test/CodeGen/AMDGPU/fmed3.ll             | 130 ---
 llvm/test/CodeGen/AMDGPU/fmin3.ll             |  12 -
 llvm/test/CodeGen/AMDGPU/fminimum.ll          |   4 -
 .../AMDGPU/fmul-2-combine-multi-use.ll        |  32 -
 llvm/test/CodeGen/AMDGPU/fmul.f16.ll          |  16 -
 llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll       |  98 ---
 llvm/test/CodeGen/AMDGPU/fnearbyint.ll        |  14 -
 llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll     |  22 -
 .../CodeGen/AMDGPU/fneg-modifier-casting.ll   |   2 -
 llvm/test/CodeGen/AMDGPU/fneg.f16.ll          |  22 -
 llvm/test/CodeGen/AMDGPU/fneg.ll              |  28 -
 .../test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll |  20 -
 llvm/test/CodeGen/AMDGPU/fp-classify.ll       |  30 -
 .../AMDGPU/fp-min-max-buffer-atomics.ll       |  30 -
 .../AMDGPU/fp-min-max-buffer-ptr-atomics.ll   |  20 -
 .../AMDGPU/fp-min-max-num-global-atomics.ll   |   8 -
 llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll      |   4 -
 llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll      |   4 -
 llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll      |   4 -
 llvm/test/CodeGen/AMDGPU/fpext.f16.ll         |  56 --
 llvm/test/CodeGen/AMDGPU/fptosi.f16.ll        |  28 -
 llvm/test/CodeGen/AMDGPU/fptoui.f16.ll        |  28 -
 llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll       |  40 -
 llvm/test/CodeGen/AMDGPU/fptrunc.ll           |  28 -
 llvm/test/CodeGen/AMDGPU/frem.ll              |  56 --
 llvm/test/CodeGen/AMDGPU/fshl.ll              |  14 -
 llvm/test/CodeGen/AMDGPU/fshr.ll              |  12 -
 llvm/test/CodeGen/AMDGPU/fsub.f16.ll          |  12 -
 .../AMDGPU/gfx12_scalar_subword_loads.ll      |  92 ---
 .../CodeGen/AMDGPU/global-atomicrmw-fadd.ll   |   4 -
 .../CodeGen/AMDGPU/global-saddr-atomics.ll    |  16 -
 .../test/CodeGen/AMDGPU/global-saddr-store.ll | 244 ------
 .../test/CodeGen/AMDGPU/global_atomics_i64.ll | 114 ---
 .../AMDGPU/global_atomics_scan_fadd.ll        |  24 -
 .../AMDGPU/global_atomics_scan_fmax.ll        |  40 -
 .../AMDGPU/global_atomics_scan_fmin.ll        |  40 -
 llvm/test/CodeGen/AMDGPU/half.ll              |  88 --
 llvm/test/CodeGen/AMDGPU/idiv-licm.ll         |  16 -
 llvm/test/CodeGen/AMDGPU/idot4s.ll            |  34 -
 llvm/test/CodeGen/AMDGPU/idot4u.ll            |  62 --
 .../test/CodeGen/AMDGPU/image-load-d16-tfe.ll |  14 -
 llvm/test/CodeGen/AMDGPU/imm16.ll             |  66 --
 .../CodeGen/AMDGPU/insert_vector_elt.v2i16.ll |  70 --
 .../CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll  |   2 -
 .../AMDGPU/llvm.amdgcn.bitreplicate.ll        |   2 -
 .../CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll |  10 -
 .../CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll   |  18 -
 .../AMDGPU/llvm.amdgcn.ds.add.gs.reg.rtn.ll   |   4 -
 .../AMDGPU/llvm.amdgcn.ds.bvh.stack.rtn.ll    |   4 -
 .../AMDGPU/llvm.amdgcn.ds.sub.gs.reg.rtn.ll   |   4 -
 .../CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll    | 188 -----
 .../CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll    |  96 ---
 .../AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll     |   4 -
 .../AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll       |   2 -
 .../AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll      |   4 -
 ...vm.amdgcn.global.atomic.ordered.add.b64.ll |   4 -
 .../AMDGPU/llvm.amdgcn.global.load.tr-w32.ll  |   8 -
 .../AMDGPU/llvm.amdgcn.global.load.tr-w64.ll  |   8 -
 .../CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll    | 128 ---
 .../CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll    |  68 --
 .../AMDGPU/llvm.amdgcn.image.a16.dim.ll       | 326 ++------
 .../AMDGPU/llvm.amdgcn.image.a16.encode.ll    |  76 --
 .../CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll   | 349 ++------
 .../AMDGPU/llvm.amdgcn.image.store.a16.d16.ll |  48 --
 .../AMDGPU/llvm.amdgcn.image.store.a16.ll     |  48 --
 .../AMDGPU/llvm.amdgcn.inverse.ballot.i32.ll  |  16 -
 .../AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll  |  28 -
 .../CodeGen/AMDGPU/llvm.amdgcn.is.private.ll  |   2 -
 .../CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll   |   2 -
 .../CodeGen/AMDGPU/llvm.amdgcn.permlane.ll    | 752 ------------------
 .../AMDGPU/llvm.amdgcn.permlane16.var.ll      | 104 ---
 .../CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll  |   6 -
 .../AMDGPU/llvm.amdgcn.permlane64.ptr.ll      |  16 -
 .../CodeGen/AMDGPU/llvm.amdgcn.quadmask.ll    |   4 -
 .../AMDGPU/llvm.amdgcn.raw.buffer.load.tfe.ll |  40 -
 .../AMDGPU/llvm.amdgcn.raw.buffer.store.ll    |  48 --
 .../llvm.amdgcn.raw.ptr.buffer.store.bf16.ll  |   8 -
 .../llvm.amdgcn.raw.ptr.tbuffer.store.d16.ll  |   8 -
 .../llvm.amdgcn.raw.ptr.tbuffer.store.ll      |  22 -
 .../llvm.amdgcn.raw.tbuffer.store.d16.ll      |  18 -
 .../AMDGPU/llvm.amdgcn.raw.tbuffer.store.ll   |  44 -
 .../CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll |  32 -
 .../CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll |  32 -
 .../CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll   |   6 -
 .../AMDGPU/llvm.amdgcn.s.barrier.wait.ll      |  96 ---
 .../AMDGPU/llvm.amdgcn.s.buffer.load.ll       |   4 -
 .../CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll |  24 -
 .../llvm.amdgcn.struct.buffer.load.tfe.ll     |  40 -
 .../AMDGPU/llvm.amdgcn.struct.buffer.store.ll |  34 -
 ...lvm.amdgcn.struct.ptr.tbuffer.store.d16.ll |   8 -
 .../llvm.amdgcn.struct.ptr.tbuffer.store.ll   |  28 -
 .../llvm.amdgcn.struct.tbuffer.store.d16.ll   |  18 -
 .../llvm.amdgcn.struct.tbuffer.store.ll       |  56 --
 .../CodeGen/AMDGPU/llvm.amdgcn.wave.id.ll     |   2 -
 .../CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll     |  52 --
 .../CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll     |  52 --
 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.ll   |   4 -
 .../CodeGen/AMDGPU/llvm.amdgcn.writelane.ll   |  76 --
 llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll     |   8 -
 llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll      |   4 -
 llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll    |   8 -
 llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll  |  16 -
 llvm/test/CodeGen/AMDGPU/llvm.get.fpmode.ll   |   2 -
 .../CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll    |   2 -
 .../CodeGen/AMDGPU/llvm.is.fpclass.f16.ll     |   2 -
 llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll   |   4 -
 llvm/test/CodeGen/AMDGPU/llvm.log.ll          |  16 -
 llvm/test/CodeGen/AMDGPU/llvm.log10.ll        |  16 -
 llvm/test/CodeGen/AMDGPU/llvm.log2.ll         |  16 -
 llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll   |  18 -
 llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll   |  18 -
 llvm/test/CodeGen/AMDGPU/llvm.mulo.ll         |   8 -
 llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll     |   4 -
 llvm/test/CodeGen/AMDGPU/llvm.round.ll        |  12 -
 llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll |   2 -
 llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll      |   4 -
 llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll     |   4 -
 llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll    |   4 -
 .../AMDGPU/load-constant-always-uniform.ll    |   4 -
 llvm/test/CodeGen/AMDGPU/load-constant-f32.ll |   2 -
 llvm/test/CodeGen/AMDGPU/load-constant-f64.ll |   4 -
 llvm/test/CodeGen/AMDGPU/load-constant-i1.ll  |  88 --
 llvm/test/CodeGen/AMDGPU/load-constant-i16.ll |  78 --
 llvm/test/CodeGen/AMDGPU/load-constant-i32.ll |  50 --
 llvm/test/CodeGen/AMDGPU/load-constant-i64.ll |  12 -
 llvm/test/CodeGen/AMDGPU/load-constant-i8.ll  | 104 ---
 .../CodeGen/AMDGPU/local-atomicrmw-fadd.ll    |   8 -
 .../test/CodeGen/AMDGPU/loop-prefetch-data.ll |   4 -
 .../lower-work-group-id-intrinsics-hsa.ll     |   4 -
 .../lower-work-group-id-intrinsics-pal.ll     |  70 +-
 llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll        |  16 -
 llvm/test/CodeGen/AMDGPU/mad.u16.ll           |   2 -
 llvm/test/CodeGen/AMDGPU/mad_64_32.ll         |   4 -
 llvm/test/CodeGen/AMDGPU/madak.ll             |  40 -
 .../match-perm-extract-vector-elt-bug.ll      |   2 -
 .../CodeGen/AMDGPU/max-hard-clause-length.ll  |   4 -
 llvm/test/CodeGen/AMDGPU/min.ll               |  70 --
 llvm/test/CodeGen/AMDGPU/minimummaximum.ll    |   8 -
 llvm/test/CodeGen/AMDGPU/minmax.ll            |  16 -
 llvm/test/CodeGen/AMDGPU/mul.ll               |  76 --
 .../CodeGen/AMDGPU/offset-split-global.ll     | 142 ----
 llvm/test/CodeGen/AMDGPU/omod.ll              |  90 ---
 .../CodeGen/AMDGPU/release-vgprs-dbg-loc.mir  |   4 +-
 llvm/test/CodeGen/AMDGPU/release-vgprs.mir    | 136 ++--
 llvm/test/CodeGen/AMDGPU/rotl.ll              |   6 -
 llvm/test/CodeGen/AMDGPU/rotr.ll              |   6 -
 llvm/test/CodeGen/AMDGPU/saddo.ll             |  12 -
 llvm/test/CodeGen/AMDGPU/scalar-float-sopc.ll | 112 ---
 llvm/test/CodeGen/AMDGPU/select.f16.ll        |  20 -
 llvm/test/CodeGen/AMDGPU/shl.v2i16.ll         |  16 -
 .../CodeGen/AMDGPU/shrink-add-sub-constant.ll |  74 --
 llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll    |  16 -
 llvm/test/CodeGen/AMDGPU/sitofp.f16.ll        |  20 -
 llvm/test/CodeGen/AMDGPU/skip-if-dead.ll      |   6 -
 llvm/test/CodeGen/AMDGPU/sub.ll               |  26 -
 llvm/test/CodeGen/AMDGPU/sub.v2i16.ll         |  26 -
 llvm/test/CodeGen/AMDGPU/trap-abis.ll         |   6 -
 llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll    |  16 -
 llvm/test/CodeGen/AMDGPU/uitofp.f16.ll        |  20 -
 llvm/test/CodeGen/AMDGPU/v_cndmask.ll         |  46 --
 llvm/test/CodeGen/AMDGPU/v_madak_f16.ll       |   4 -
 llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll   |   8 -
 .../CodeGen/AMDGPU/vector_shuffle.packed.ll   |   6 -
 .../AMDGPU/vgpr-mark-last-scratch-load.ll     |   4 -
 .../wait-before-stores-with-scope_sys.ll      |   4 -
 llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll  |  24 -
 ...wmma-gfx12-w32-f16-f32-matrix-modifiers.ll |  56 --
 .../test/CodeGen/AMDGPU/wmma-gfx12-w32-imm.ll |  44 -
 .../AMDGPU/wmma-gfx12-w32-iu-modifiers.ll     |  36 -
 .../AMDGPU/wmma-gfx12-w32-swmmac-index_key.ll |  20 -
 llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32.ll    |  44 -
 ...wmma-gfx12-w64-f16-f32-matrix-modifiers.ll |  56 --
 .../test/CodeGen/AMDGPU/wmma-gfx12-w64-imm.ll |  44 -
 .../AMDGPU/wmma-gfx12-w64-iu-modifiers.ll     |  36 -
 .../AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll |  22 -
 llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64.ll    |  44 -
 llvm/test/CodeGen/AMDGPU/wmma_multiple_32.ll  |  44 -
 llvm/test/CodeGen/AMDGPU/wmma_multiple_64.ll  |  44 -
 .../AMDGPU/workgroup-id-in-arch-sgprs.ll      |   6 -
 256 files changed, 335 insertions(+), 9649 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 8f1757db8a85f5..c2806f9cd88199 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -2606,15 +2606,24 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
 
   // Insert DEALLOC_VGPR messages before previously identified S_ENDPGM
   // instructions.
-  for (MachineInstr *MI : ReleaseVGPRInsts) {
-    if (ST->requiresNopBeforeDeallocVGPRs()) {
-      BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::S_NOP))
-          .addImm(0);
+  // Skip deallocation if kernel is waveslot limited vs VGPR limited. A short
+  // waveslot limited kernel runs slower with the deallocation.
+  if (!ReleaseVGPRInsts.empty() &&
+      (MF.getFrameInfo().hasCalls() ||
+       AMDGPU::IsaInfo::getTotalNumVGPRs(ST) /
+               TRI->getNumUsedPhysRegs(*MRI, AMDGPU::VGPR_32RegClass) <
+           AMDGPU::IsaInfo::getMaxWavesPerEU(ST))) {
+    for (MachineInstr *MI : ReleaseVGPRInsts) {
+      if (ST->requiresNopBeforeDeallocVGPRs()) {
+        BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+                TII->get(AMDGPU::S_NOP))
+            .addImm(0);
+      }
+      BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+              TII->get(AMDGPU::S_SENDMSG))
+          .addImm(AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus);
+      Modified = true;
     }
-    BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
-            TII->get(AMDGPU::S_SENDMSG))
-        .addImm(AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus);
-    Modified = true;
   }
   ReleaseVGPRInsts.clear();
   PreheadersToFlush.clear();
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll
index 359c1e53de99e3..ad3c588f575512 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll
@@ -15,8 +15,6 @@ define amdgpu_kernel void @s_add_u64(ptr addrspace(1) %out, i64 %a, i64 %b) {
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: s_add_u64:
@@ -30,8 +28,6 @@ define amdgpu_kernel void @s_add_u64(ptr addrspace(1) %out, i64 %a, i64 %b) {
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %add = add i64 %a, %b
@@ -45,8 +41,6 @@ define amdgpu_ps void @v_add_u64(ptr addrspace(1) %out, i64 %a, i64 %b) {
 ; GCN-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
 ; GCN-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
 ; GCN-NEXT:    global_store_b64 v[0:1], v[2:3], off
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GCN-NEXT:    s_endpgm
 entry:
   %add = add i64 %a, %b
@@ -67,8 +61,6 @@ define amdgpu_kernel void @s_sub_u64(ptr addrspace(1) %out, i64 %a, i64 %b) {
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: s_sub_u64:
@@ -82,8 +74,6 @@ define amdgpu_kernel void @s_sub_u64(ptr addrspace(1) %out, i64 %a, i64 %b) {
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %sub = sub i64 %a, %b
@@ -97,8 +87,6 @@ define amdgpu_ps void @v_sub_u64(ptr addrspace(1) %out, i64 %a, i64 %b) {
 ; GCN-NEXT:    v_sub_co_u32 v2, vcc_lo, v2, v4
 ; GCN-NEXT:    v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
 ; GCN-NEXT:    global_store_b64 v[0:1], v[2:3], off
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GCN-NEXT:    s_endpgm
 entry:
   %sub = sub i64 %a, %b
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
index 705bcbddf227a6..e28a1efb75404d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
@@ -84,8 +84,6 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr add
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %result = atomicrmw udec_wrap ptr addrspace(3) %ptr, i32 42 syncscope("agent") seq_cst, align 4
   store i32 %result, ptr addrspace(1) %out, align 4
@@ -163,8 +161,6 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out,
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %gep = getelementptr i32, ptr addrspace(3) %ptr, i32 4
   %result = atomicrmw udec_wrap ptr addrspace(3) %gep, i32 42 syncscope("agent") seq_cst, align 4
@@ -353,8 +349,6 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 42 syncscope("agent") seq_cst, align 4
   store i32 %result, ptr addrspace(1) %out, align 4
@@ -431,8 +425,6 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(ptr addrspace(1) %ou
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4
   %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 42 syncscope("agent") seq_cst, align 4
@@ -510,8 +502,6 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_system(ptr addrspace
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4
   %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 42 seq_cst, align 4
@@ -797,8 +787,6 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.tid = getelementptr i32, ptr addrspace(1) %ptr, i32 %id
@@ -2302,8 +2290,6 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0(ptr addrspace(1) %out, ptr
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b32 v2, v0, s[2:3]
 ; GFX11-NEXT:    global_store_b32 v2, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #2
   %idx.0 = add nsw i32 %tid.x, 2
@@ -2390,8 +2376,6 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr add
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %result = atomicrmw udec_wrap ptr addrspace(3) %ptr, i64 42 syncscope("agent") seq_cst, align 8
   store i64 %result, ptr addrspace(1) %out, align 4
@@ -2474,8 +2458,6 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64_offset(ptr addrspace(1) %out,
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4
   %result = atomicrmw udec_wrap ptr addrspace(3) %gep, i64 42 syncscope("agent") seq_cst, align 8
@@ -2679,8 +2661,6 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 42 syncscope("agent") seq_cst, align 8
   store i64 %result, ptr addrspace(1) %out, align 4
@@ -2762,8 +2742,6 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(ptr addrspace(1) %ou
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4
   %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 42 syncscope("agent") seq_cst, align 8
@@ -2846,8 +2824,6 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_system(ptr addrspace
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4
   %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 42 seq_cst, align 8
@@ -3153,8 +3129,6 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(ptr addrspace
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.tid = getelementptr i64, ptr addrspace(1) %ptr, i32 %id
@@ -3334,8 +3308,6 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out,
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b32 v3, v2, s[2:3]
 ; GFX11-NEXT:    global_store_b64 v3, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #2
   %idx.0 = add nsw i32 %tid.x, 2
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
index b3a7e65f771c43..d63044d7cec6d8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
@@ -84,8 +84,6 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr add
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %result = atomicrmw uinc_wrap ptr addrspace(3) %ptr, i32 42 syncscope("agent") seq_cst, align 4
   store i32 %result, ptr addrspace(1) %out, align 4
@@ -163,8 +161,6 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(ptr addrspace(1) %out,
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %gep = getelementptr i32, ptr addrspace(3) %ptr, i32 4
   %result = atomicrmw uinc_wrap ptr addrspace(3) %gep, i32 42 syncscope("agent") seq_cst, align 4
@@ -353,8 +349,6 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 42 syncscope("agent") seq_cst, align 4
   store i32 %result, ptr addrspace(1) %out, align 4
@@ -431,8 +425,6 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(ptr addrspace(1) %ou
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4
   %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 42 syncscope("agent") seq_cst, align 4
@@ -510,8 +502,6 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_sistem(ptr addrspace
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4
   %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 42 seq_cst, align 4
@@ -797,8 +787,6 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.tid = getelementptr i32, ptr addrspace(1) %ptr, i32 %id
@@ -967,8 +955,6 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(ptr addrspace(1) %out,
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b32 v2, v0, s[2:3]
 ; GFX11-NEXT:    global_store_b32 v2, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #2
   %idx.0 = add nsw i32 %tid.x, 2
@@ -1055,8 +1041,6 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr add
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %result = atomicrmw uinc_wrap ptr addrspace(3) %ptr, i64 42 syncscope("agent") seq_cst, align 8
   store i64 %result, ptr addrspace(1) %out, align 4
@@ -1139,8 +1123,6 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(ptr addrspace(1) %out,
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4
   %result = atomicrmw uinc_wrap ptr addrspace(3) %gep, i64 42 syncscope("agent") seq_cst, align 8
@@ -1344,8 +1326,6 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 42 syncscope("agent") seq_cst, align 8
   store i64 %result, ptr addrspace(1) %out, align 4
@@ -1427,8 +1407,6 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %ou
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4
   %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 42 syncscope("agent") seq_cst, align 8
@@ -1511,8 +1489,6 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_system(ptr addrspace
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4
   %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 42 seq_cst, align 8
@@ -1818,8 +1794,6 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.tid = getelementptr i64, ptr addrspace(1) %ptr, i32 %id
@@ -2680,8 +2654,6 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out,
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b32 v3, v2, s[2:3]
 ; GFX11-NEXT:    global_store_b64 v3, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #2
   %idx.0 = add nsw i32 %tid.x, 2
@@ -3541,8 +3513,6 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0,
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b32 v1, v2, s[0:1]
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %result0 = atomicrmw uinc_wrap ptr addrspace(3) %ptr, i32 42 syncscope("agent") seq_cst, align 4
   %result1 = atomicrmw uinc_wrap ptr addrspace(3) %ptr, i32 42 syncscope("agent") seq_cst, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
index 34efb089b72bf1..ca6e5df43a0434 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
@@ -480,8 +480,6 @@ define amdgpu_ps void @dyn_extract_v8i64_const_s_s(i32 inreg %sel) {
 ; GFX11-NEXT:    s_movrels_b64 s[0:1], s[4:5]
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    global_store_b64 v[0:1], v[0:1], off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %ext = extractelement <8 x i64> <i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8>, i32 %sel
@@ -627,8 +625,6 @@ define amdgpu_ps void @dyn_extract_v8i64_s_v(<8 x i64> inreg %vec, i32 %sel) {
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, v1, s16, vcc_lo
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, v2, s17, vcc_lo
 ; GFX11-NEXT:    global_store_b64 v[0:1], v[0:1], off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %ext = extractelement <8 x i64> %vec, i32 %sel
@@ -745,8 +741,6 @@ define amdgpu_ps void @dyn_extract_v8i64_v_s(<8 x i64> %vec, i32 inreg %sel) {
 ; GFX11-NEXT:    v_movrels_b32_e32 v16, v0
 ; GFX11-NEXT:    v_movrels_b32_e32 v17, v1
 ; GFX11-NEXT:    global_store_b64 v[0:1], v[16:17], off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %ext = extractelement <8 x i64> %vec, i32 %sel
@@ -852,8 +846,6 @@ define amdgpu_ps void @dyn_extract_v8i64_s_s(<8 x i64> inreg %vec, i32 inreg %se
 ; GFX11-NEXT:    s_movrels_b64 s[0:1], s[0:1]
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    global_store_b64 v[0:1], v[0:1], off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %ext = extractelement <8 x i64> %vec, i32 %sel
@@ -1805,8 +1797,6 @@ define amdgpu_ps void @dyn_extract_v8p1_s_s(<8 x ptr addrspace(1)> inreg %vec, i
 ; GFX11-NEXT:    s_movrels_b64 s[0:1], s[0:1]
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    global_store_b64 v[0:1], v[0:1], off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %ext = extractelement <8 x ptr addrspace(1)> %vec, i32 %idx
@@ -3379,8 +3369,6 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; GFX11-NEXT:    s_cselect_b64 s[2:3], s[4:5], s[2:3]
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %ext = extractelement <5 x double> <double 1.0, double 2.0, double 3.0, double 4.0, double 5.0>, i32 %sel
@@ -4369,8 +4357,6 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GFX11-NEXT:    s_cselect_b32 s2, 4.0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %ext = extractelement <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, i32 %sel
@@ -4727,8 +4713,6 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GFX11-NEXT:    s_cselect_b64 s[2:3], 4.0, s[2:3]
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %ext = extractelement <4 x double> <double 1.0, double 2.0, double 3.0, double 4.0>, i32 %sel
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
index 6e2e88f22600a8..097640312322d0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
@@ -3699,8 +3699,6 @@ define amdgpu_gs void @sgpr_base_large_offset(ptr addrspace(1) %out, ptr addrspa
 ; GFX11-NEXT:    scratch_load_b32 v2, off, s0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: sgpr_base_large_offset:
@@ -3708,8 +3706,6 @@ define amdgpu_gs void @sgpr_base_large_offset(ptr addrspace(1) %out, ptr addrspa
 ; GFX12-NEXT:    scratch_load_b32 v2, off, s0 offset:65512
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v[0:1], v2, off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 ;
 ; UNALIGNED_GFX9-LABEL: sgpr_base_large_offset:
@@ -3748,8 +3744,6 @@ define amdgpu_gs void @sgpr_base_large_offset(ptr addrspace(1) %out, ptr addrspa
 ; UNALIGNED_GFX11-NEXT:    scratch_load_b32 v2, off, s0
 ; UNALIGNED_GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; UNALIGNED_GFX11-NEXT:    global_store_b32 v[0:1], v2, off
-; UNALIGNED_GFX11-NEXT:    s_nop 0
-; UNALIGNED_GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; UNALIGNED_GFX11-NEXT:    s_endpgm
 ;
 ; UNALIGNED_GFX12-LABEL: sgpr_base_large_offset:
@@ -3757,8 +3751,6 @@ define amdgpu_gs void @sgpr_base_large_offset(ptr addrspace(1) %out, ptr addrspa
 ; UNALIGNED_GFX12-NEXT:    scratch_load_b32 v2, off, s0 offset:65512
 ; UNALIGNED_GFX12-NEXT:    s_wait_loadcnt 0x0
 ; UNALIGNED_GFX12-NEXT:    global_store_b32 v[0:1], v2, off
-; UNALIGNED_GFX12-NEXT:    s_nop 0
-; UNALIGNED_GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; UNALIGNED_GFX12-NEXT:    s_endpgm
 entry:
   %large_offset = getelementptr i8, ptr addrspace(5) %sgpr_base, i32 65512
@@ -3809,8 +3801,6 @@ define amdgpu_gs void @sgpr_base_large_offset_split(ptr addrspace(1) %out, ptr a
 ; GFX11-NEXT:    scratch_load_b32 v2, off, s0 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: sgpr_base_large_offset_split:
@@ -3821,8 +3811,6 @@ define amdgpu_gs void @sgpr_base_large_offset_split(ptr addrspace(1) %out, ptr a
 ; GFX12-NEXT:    scratch_load_b32 v2, off, s0 scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v[0:1], v2, off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 ;
 ; UNALIGNED_GFX9-LABEL: sgpr_base_large_offset_split:
@@ -3866,8 +3854,6 @@ define amdgpu_gs void @sgpr_base_large_offset_split(ptr addrspace(1) %out, ptr a
 ; UNALIGNED_GFX11-NEXT:    scratch_load_b32 v2, off, s0 glc dlc
 ; UNALIGNED_GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; UNALIGNED_GFX11-NEXT:    global_store_b32 v[0:1], v2, off
-; UNALIGNED_GFX11-NEXT:    s_nop 0
-; UNALIGNED_GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; UNALIGNED_GFX11-NEXT:    s_endpgm
 ;
 ; UNALIGNED_GFX12-LABEL: sgpr_base_large_offset_split:
@@ -3878,8 +3864,6 @@ define amdgpu_gs void @sgpr_base_large_offset_split(ptr addrspace(1) %out, ptr a
 ; UNALIGNED_GFX12-NEXT:    scratch_load_b32 v2, off, s0 scope:SCOPE_SYS
 ; UNALIGNED_GFX12-NEXT:    s_wait_loadcnt 0x0
 ; UNALIGNED_GFX12-NEXT:    global_store_b32 v[0:1], v2, off
-; UNALIGNED_GFX12-NEXT:    s_nop 0
-; UNALIGNED_GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; UNALIGNED_GFX12-NEXT:    s_endpgm
 entry:
   ;%allignedBase = alloca [33554432 x i8], align 4, addrspace(5)
@@ -4041,8 +4025,6 @@ define amdgpu_gs void @sgpr_base_negative_offset(ptr addrspace(1) %out, ptr addr
 ; GFX11-NEXT:    scratch_load_b32 v2, off, s0 offset:-24
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: sgpr_base_negative_offset:
@@ -4050,8 +4032,6 @@ define amdgpu_gs void @sgpr_base_negative_offset(ptr addrspace(1) %out, ptr addr
 ; GFX12-NEXT:    scratch_load_b32 v2, off, s0 offset:-24
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v[0:1], v2, off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 ;
 ; UNALIGNED_GFX9-LABEL: sgpr_base_negative_offset:
@@ -4088,8 +4068,6 @@ define amdgpu_gs void @sgpr_base_negative_offset(ptr addrspace(1) %out, ptr addr
 ; UNALIGNED_GFX11-NEXT:    scratch_load_b32 v2, off, s0 offset:-24
 ; UNALIGNED_GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; UNALIGNED_GFX11-NEXT:    global_store_b32 v[0:1], v2, off
-; UNALIGNED_GFX11-NEXT:    s_nop 0
-; UNALIGNED_GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; UNALIGNED_GFX11-NEXT:    s_endpgm
 ;
 ; UNALIGNED_GFX12-LABEL: sgpr_base_negative_offset:
@@ -4097,8 +4075,6 @@ define amdgpu_gs void @sgpr_base_negative_offset(ptr addrspace(1) %out, ptr addr
 ; UNALIGNED_GFX12-NEXT:    scratch_load_b32 v2, off, s0 offset:-24
 ; UNALIGNED_GFX12-NEXT:    s_wait_loadcnt 0x0
 ; UNALIGNED_GFX12-NEXT:    global_store_b32 v[0:1], v2, off
-; UNALIGNED_GFX12-NEXT:    s_nop 0
-; UNALIGNED_GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; UNALIGNED_GFX12-NEXT:    s_endpgm
 entry:
   %scevgep28 = getelementptr i8, ptr addrspace(5) %scevgep, i32 -24
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
index 676a02aaf8a19c..4ae98ff1edf6c9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
@@ -90,8 +90,6 @@ define amdgpu_ps void @insertelement_s_v2i16_s_s(ptr addrspace(4) inreg %ptr, i1
 ; GFX11-NEXT:    s_or_b32 s0, s0, s1
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
 ; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %vec = load <2 x i16>, ptr addrspace(4) %ptr
   %insert = insertelement <2 x i16> %vec, i16 %val, i32 %idx
@@ -184,8 +182,6 @@ define amdgpu_ps void @insertelement_v_v2i16_s_s(ptr addrspace(1) %ptr, i16 inre
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_and_or_b32 v2, v2, s1, s0
 ; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %vec = load <2 x i16>, ptr addrspace(1 ) %ptr
   %insert = insertelement <2 x i16> %vec, i16 %val, i32 %idx
@@ -272,8 +268,6 @@ define amdgpu_ps void @insertelement_s_v2i16_v_s(ptr addrspace(4) inreg %ptr, i1
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_lshl_or_b32 v2, v2, s1, s0
 ; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %vec = load <2 x i16>, ptr addrspace(4) %ptr
   %insert = insertelement <2 x i16> %vec, i16 %val, i32 %idx
@@ -367,8 +361,6 @@ define amdgpu_ps void @insertelement_s_v2i16_s_v(ptr addrspace(4) inreg %ptr, i1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_and_or_b32 v2, s0, v3, v2
 ; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %vec = load <2 x i16>, ptr addrspace(4) %ptr
   %insert = insertelement <2 x i16> %vec, i16 %val, i32 %idx
@@ -459,8 +451,6 @@ define amdgpu_ps void @insertelement_s_v2i16_v_v(ptr addrspace(4) inreg %ptr, i1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_and_or_b32 v2, s0, v2, v3
 ; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %vec = load <2 x i16>, ptr addrspace(4) %ptr
   %insert = insertelement <2 x i16> %vec, i16 %val, i32 %idx
@@ -556,8 +546,6 @@ define amdgpu_ps void @insertelement_v_v2i16_s_v(ptr addrspace(1) %ptr, i16 inre
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_and_or_b32 v2, v3, v4, v2
 ; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %vec = load <2 x i16>, ptr addrspace(1) %ptr
   %insert = insertelement <2 x i16> %vec, i16 %val, i32 %idx
@@ -649,8 +637,6 @@ define amdgpu_ps void @insertelement_v_v2i16_v_s(ptr addrspace(1) %ptr, i16 %val
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_and_or_b32 v2, v3, s0, v2
 ; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %vec = load <2 x i16>, ptr addrspace(1) %ptr
   %insert = insertelement <2 x i16> %vec, i16 %val, i32 %idx
@@ -743,8 +729,6 @@ define amdgpu_ps void @insertelement_v_v2i16_v_v(ptr addrspace(1) %ptr, i16 %val
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_and_or_b32 v2, v4, v2, v3
 ; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %vec = load <2 x i16>, ptr addrspace(1) %ptr
   %insert = insertelement <2 x i16> %vec, i16 %val, i32 %idx
@@ -925,8 +909,6 @@ define amdgpu_ps void @insertelement_v_v4i16_s_s(ptr addrspace(1) %ptr, i16 inre
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
 ; GFX11-NEXT:    global_store_b64 v[2:3], v[0:1], off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %vec = load <4 x i16>, ptr addrspace(1 ) %ptr
   %insert = insertelement <4 x i16> %vec, i16 %val, i32 %idx
@@ -1058,8 +1040,6 @@ define amdgpu_ps void @insertelement_s_v4i16_v_s(ptr addrspace(4) inreg %ptr, i1
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s2, 1
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
 ; GFX11-NEXT:    global_store_b64 v[2:3], v[0:1], off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %vec = load <4 x i16>, ptr addrspace(4) %ptr
   %insert = insertelement <4 x i16> %vec, i16 %val, i32 %idx
@@ -1200,8 +1180,6 @@ define amdgpu_ps void @insertelement_s_v4i16_s_v(ptr addrspace(4) inreg %ptr, i1
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s0
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
 ; GFX11-NEXT:    global_store_b64 v[2:3], v[0:1], off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %vec = load <4 x i16>, ptr addrspace(4) %ptr
   %insert = insertelement <4 x i16> %vec, i16 %val, i32 %idx
@@ -1339,8 +1317,6 @@ define amdgpu_ps void @insertelement_s_v4i16_v_v(ptr addrspace(4) inreg %ptr, i1
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s0
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
 ; GFX11-NEXT:    global_store_b64 v[2:3], v[0:1], off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %vec = load <4 x i16>, ptr addrspace(4) %ptr
   %insert = insertelement <4 x i16> %vec, i16 %val, i32 %idx
@@ -1467,8 +1443,6 @@ define amdgpu_ps void @insertelement_v_v4i16_s_v(ptr addrspace(1) %ptr, i16 inre
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
 ; GFX11-NEXT:    global_store_b64 v[2:3], v[0:1], off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %vec = load <4 x i16>, ptr addrspace(1) %ptr
   %insert = insertelement <4 x i16> %vec, i16 %val, i32 %idx
@@ -1590,8 +1564,6 @@ define amdgpu_ps void @insertelement_v_v4i16_v_s(ptr addrspace(1) %ptr, i16 %val
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
 ; GFX11-NEXT:    global_store_b64 v[2:3], v[0:1], off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %vec = load <4 x i16>, ptr addrspace(1) %ptr
   %insert = insertelement <4 x i16> %vec, i16 %val, i32 %idx
@@ -1716,8 +1688,6 @@ define amdgpu_ps void @insertelement_v_v4i16_v_v(ptr addrspace(1) %ptr, i16 %val
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
 ; GFX11-NEXT:    global_store_b64 v[2:3], v[0:1], off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %vec = load <4 x i16>, ptr addrspace(1) %ptr
   %insert = insertelement <4 x i16> %vec, i16 %val, i32 %idx
@@ -1900,8 +1870,6 @@ define amdgpu_ps void @insertelement_s_v8i16_s_s(ptr addrspace(4) inreg %ptr, i1
 ; GFX11-NEXT:    v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX11-NEXT:    global_store_b128 v[4:5], v[0:3], off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %vec = load <8 x i16>, ptr addrspace(4) %ptr
   %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx
@@ -2057,8 +2025,6 @@ define amdgpu_ps void @insertelement_v_v8i16_s_s(ptr addrspace(1) %ptr, i16 inre
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s1
 ; GFX11-NEXT:    global_store_b128 v[4:5], v[0:3], off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %vec = load <8 x i16>, ptr addrspace(1 ) %ptr
   %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx
@@ -2240,8 +2206,6 @@ define amdgpu_ps void @insertelement_s_v8i16_v_s(ptr addrspace(4) inreg %ptr, i1
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s5, 3
 ; GFX11-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc_lo
 ; GFX11-NEXT:    global_store_b128 v[4:5], v[0:3], off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %vec = load <8 x i16>, ptr addrspace(4) %ptr
   %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx
@@ -2428,8 +2392,6 @@ define amdgpu_ps void @insertelement_s_v8i16_s_v(ptr addrspace(4) inreg %ptr, i1
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, v7, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s1
 ; GFX11-NEXT:    global_store_b128 v[4:5], v[0:3], off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %vec = load <8 x i16>, ptr addrspace(4) %ptr
   %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx
@@ -2616,8 +2578,6 @@ define amdgpu_ps void @insertelement_s_v8i16_v_v(ptr addrspace(4) inreg %ptr, i1
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, v7, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s1
 ; GFX11-NEXT:    global_store_b128 v[4:5], v[0:3], off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %vec = load <8 x i16>, ptr addrspace(4) %ptr
   %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx
@@ -2774,8 +2734,6 @@ define amdgpu_ps void @insertelement_v_v8i16_s_v(ptr addrspace(1) %ptr, i16 inre
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, v5, v9, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v3, v6, v9, s1
 ; GFX11-NEXT:    global_store_b128 v[7:8], v[0:3], off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %vec = load <8 x i16>, ptr addrspace(1) %ptr
   %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx
@@ -2928,8 +2886,6 @@ define amdgpu_ps void @insertelement_v_v8i16_v_s(ptr addrspace(1) %ptr, i16 %val
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, v5, v9, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v3, v6, v9, s1
 ; GFX11-NEXT:    global_store_b128 v[7:8], v[0:3], off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %vec = load <8 x i16>, ptr addrspace(1) %ptr
   %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx
@@ -3083,8 +3039,6 @@ define amdgpu_ps void @insertelement_v_v8i16_v_v(ptr addrspace(1) %ptr, i16 %val
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, v6, v3, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s1
 ; GFX11-NEXT:    global_store_b128 v[8:9], v[0:3], off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %vec = load <8 x i16>, ptr addrspace(1) %ptr
   %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx
@@ -3267,8 +3221,6 @@ define amdgpu_ps void @insertelement_s_v16i16_s_s(ptr addrspace(4) inreg %ptr, i
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b128 v[8:9], v[0:3], off
 ; GFX11-NEXT:    global_store_b128 v[10:11], v[4:7], off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %vec = load <16 x i16>, ptr addrspace(4) %ptr
   %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx
@@ -3423,8 +3375,6 @@ define amdgpu_ps void @insertelement_v_v16i16_s_s(ptr addrspace(1) %ptr, i16 inr
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b128 v[0:1], v[2:5], off
 ; GFX11-NEXT:    global_store_b128 v[10:11], v[6:9], off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %vec = load <16 x i16>, ptr addrspace(1 ) %ptr
   %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx
@@ -3605,8 +3555,6 @@ define amdgpu_ps void @insertelement_s_v16i16_v_s(ptr addrspace(4) inreg %ptr, i
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b128 v[8:9], v[0:3], off
 ; GFX11-NEXT:    global_store_b128 v[10:11], v[4:7], off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %vec = load <16 x i16>, ptr addrspace(4) %ptr
   %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx
@@ -3900,8 +3848,6 @@ define amdgpu_ps void @insertelement_s_v16i16_s_v(ptr addrspace(4) inreg %ptr, i
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b128 v[8:9], v[0:3], off
 ; GFX11-NEXT:    global_store_b128 v[10:11], v[4:7], off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %vec = load <16 x i16>, ptr addrspace(4) %ptr
   %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx
@@ -4195,8 +4141,6 @@ define amdgpu_ps void @insertelement_s_v16i16_v_v(ptr addrspace(4) inreg %ptr, i
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b128 v[8:9], v[0:3], off
 ; GFX11-NEXT:    global_store_b128 v[10:11], v[4:7], off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %vec = load <16 x i16>, ptr addrspace(4) %ptr
   %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx
@@ -4443,8 +4387,6 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(ptr addrspace(1) %ptr, i16 inr
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b128 v[11:12], v[0:3], off
 ; GFX11-NEXT:    global_store_b128 v[13:14], v[4:7], off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %vec = load <16 x i16>, ptr addrspace(1) %ptr
   %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx
@@ -4596,8 +4538,6 @@ define amdgpu_ps void @insertelement_v_v16i16_v_s(ptr addrspace(1) %ptr, i16 %va
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b128 v[0:1], v[3:6], off
 ; GFX11-NEXT:    global_store_b128 v[11:12], v[7:10], off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %vec = load <16 x i16>, ptr addrspace(1) %ptr
   %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx
@@ -4841,8 +4781,6 @@ define amdgpu_ps void @insertelement_v_v16i16_v_v(ptr addrspace(1) %ptr, i16 %va
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b128 v[12:13], v[0:3], off
 ; GFX11-NEXT:    global_store_b128 v[14:15], v[4:7], off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %vec = load <16 x i16>, ptr addrspace(1) %ptr
   %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
index d5bfb7faf7fc8a..d4b9bc6d2e3c1d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
@@ -105,8 +105,6 @@ define amdgpu_ps void @insertelement_s_v2i8_s_s(ptr addrspace(4) inreg %ptr, i8
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX11-NEXT:    global_store_b16 v[0:1], v2, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %vec = load <2 x i8>, ptr addrspace(4) %ptr
   %insert = insertelement <2 x i8> %vec, i8 %val, i32 %idx
@@ -209,8 +207,6 @@ define amdgpu_ps void @insertelement_v_v2i8_s_s(ptr addrspace(1) %ptr, i8 inreg
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX11-NEXT:    global_store_b16 v[0:1], v2, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %vec = load <2 x i8>, ptr addrspace(1 ) %ptr
   %insert = insertelement <2 x i8> %vec, i8 %val, i32 %idx
@@ -314,8 +310,6 @@ define amdgpu_ps void @insertelement_s_v2i8_v_s(ptr addrspace(4) inreg %ptr, i8
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX11-NEXT:    global_store_b16 v[0:1], v2, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %vec = load <2 x i8>, ptr addrspace(4) %ptr
   %insert = insertelement <2 x i8> %vec, i8 %val, i32 %idx
@@ -423,8 +417,6 @@ define amdgpu_ps void @insertelement_s_v2i8_s_v(ptr addrspace(4) inreg %ptr, i8
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX11-NEXT:    global_store_b16 v[0:1], v2, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %vec = load <2 x i8>, ptr addrspace(4) %ptr
   %insert = insertelement <2 x i8> %vec, i8 %val, i32 %idx
@@ -528,8 +520,6 @@ define amdgpu_ps void @insertelement_s_v2i8_v_v(ptr addrspace(4) inreg %ptr, i8
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX11-NEXT:    global_store_b16 v[0:1], v2, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %vec = load <2 x i8>, ptr addrspace(4) %ptr
   %insert = insertelement <2 x i8> %vec, i8 %val, i32 %idx
@@ -632,8 +622,6 @@ define amdgpu_ps void @insertelement_v_v2i8_s_v(ptr addrspace(1) %ptr, i8 inreg
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX11-NEXT:    global_store_b16 v[0:1], v2, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %vec = load <2 x i8>, ptr addrspace(1) %ptr
   %insert = insertelement <2 x i8> %vec, i8 %val, i32 %idx
@@ -733,8 +721,6 @@ define amdgpu_ps void @insertelement_v_v2i8_v_s(ptr addrspace(1) %ptr, i8 %val,
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX11-NEXT:    global_store_b16 v[0:1], v2, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %vec = load <2 x i8>, ptr addrspace(1) %ptr
   %insert = insertelement <2 x i8> %vec, i8 %val, i32 %idx
@@ -834,8 +820,6 @@ define amdgpu_ps void @insertelement_v_v2i8_v_v(ptr addrspace(1) %ptr, i8 %val,
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX11-NEXT:    global_store_b16 v[0:1], v2, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %vec = load <2 x i8>, ptr addrspace(1) %ptr
   %insert = insertelement <2 x i8> %vec, i8 %val, i32 %idx
@@ -985,8 +969,6 @@ define amdgpu_ps void @insertelement_v_v4i8_s_s(ptr addrspace(1) %ptr, i8 inreg
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_and_or_b32 v2, v2, s1, s0
 ; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %vec = load <4 x i8>, ptr addrspace(1 ) %ptr
   %insert = insertelement <4 x i8> %vec, i8 %val, i32 %idx
@@ -1073,8 +1055,6 @@ define amdgpu_ps void @insertelement_s_v4i8_v_s(ptr addrspace(4) inreg %ptr, i8
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_lshl_or_b32 v2, v2, s1, s0
 ; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %vec = load <4 x i8>, ptr addrspace(4) %ptr
   %insert = insertelement <4 x i8> %vec, i8 %val, i32 %idx
@@ -1168,8 +1148,6 @@ define amdgpu_ps void @insertelement_s_v4i8_s_v(ptr addrspace(4) inreg %ptr, i8
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_and_or_b32 v2, s0, v3, v2
 ; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %vec = load <4 x i8>, ptr addrspace(4) %ptr
   %insert = insertelement <4 x i8> %vec, i8 %val, i32 %idx
@@ -1260,8 +1238,6 @@ define amdgpu_ps void @insertelement_s_v4i8_v_v(ptr addrspace(4) inreg %ptr, i8
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_and_or_b32 v2, s0, v2, v3
 ; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %vec = load <4 x i8>, ptr addrspace(4) %ptr
   %insert = insertelement <4 x i8> %vec, i8 %val, i32 %idx
@@ -1357,8 +1333,6 @@ define amdgpu_ps void @insertelement_v_v4i8_s_v(ptr addrspace(1) %ptr, i8 inreg
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_and_or_b32 v2, v3, v4, v2
 ; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %vec = load <4 x i8>, ptr addrspace(1) %ptr
   %insert = insertelement <4 x i8> %vec, i8 %val, i32 %idx
@@ -1450,8 +1424,6 @@ define amdgpu_ps void @insertelement_v_v4i8_v_s(ptr addrspace(1) %ptr, i8 %val,
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_and_or_b32 v2, v3, s0, v2
 ; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %vec = load <4 x i8>, ptr addrspace(1) %ptr
   %insert = insertelement <4 x i8> %vec, i8 %val, i32 %idx
@@ -1544,8 +1516,6 @@ define amdgpu_ps void @insertelement_v_v4i8_v_v(ptr addrspace(1) %ptr, i8 %val,
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_and_or_b32 v2, v4, v2, v3
 ; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %vec = load <4 x i8>, ptr addrspace(1) %ptr
   %insert = insertelement <4 x i8> %vec, i8 %val, i32 %idx
@@ -1679,8 +1649,6 @@ define amdgpu_ps void @insertelement_s_v8i8_s_s(ptr addrspace(4) inreg %ptr, i8
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s1
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
 ; GFX11-NEXT:    global_store_b64 v[0:1], v[2:3], off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %vec = load <8 x i8>, ptr addrspace(4) %ptr
   %insert = insertelement <8 x i8> %vec, i8 %val, i32 %idx
@@ -1804,8 +1772,6 @@ define amdgpu_ps void @insertelement_v_v8i8_s_s(ptr addrspace(1) %ptr, i8 inreg
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
 ; GFX11-NEXT:    global_store_b64 v[2:3], v[0:1], off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %vec = load <8 x i8>, ptr addrspace(1 ) %ptr
   %insert = insertelement <8 x i8> %vec, i8 %val, i32 %idx
@@ -1937,8 +1903,6 @@ define amdgpu_ps void @insertelement_s_v8i8_v_s(ptr addrspace(4) inreg %ptr, i8
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s2, 1
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
 ; GFX11-NEXT:    global_store_b64 v[2:3], v[0:1], off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %vec = load <8 x i8>, ptr addrspace(4) %ptr
   %insert = insertelement <8 x i8> %vec, i8 %val, i32 %idx
@@ -2079,8 +2043,6 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(ptr addrspace(4) inreg %ptr, i8
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s0
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
 ; GFX11-NEXT:    global_store_b64 v[2:3], v[0:1], off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %vec = load <8 x i8>, ptr addrspace(4) %ptr
   %insert = insertelement <8 x i8> %vec, i8 %val, i32 %idx
@@ -2218,8 +2180,6 @@ define amdgpu_ps void @insertelement_s_v8i8_v_v(ptr addrspace(4) inreg %ptr, i8
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s0
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
 ; GFX11-NEXT:    global_store_b64 v[2:3], v[0:1], off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %vec = load <8 x i8>, ptr addrspace(4) %ptr
   %insert = insertelement <8 x i8> %vec, i8 %val, i32 %idx
@@ -2346,8 +2306,6 @@ define amdgpu_ps void @insertelement_v_v8i8_s_v(ptr addrspace(1) %ptr, i8 inreg
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
 ; GFX11-NEXT:    global_store_b64 v[2:3], v[0:1], off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %vec = load <8 x i8>, ptr addrspace(1) %ptr
   %insert = insertelement <8 x i8> %vec, i8 %val, i32 %idx
@@ -2469,8 +2427,6 @@ define amdgpu_ps void @insertelement_v_v8i8_v_s(ptr addrspace(1) %ptr, i8 %val,
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
 ; GFX11-NEXT:    global_store_b64 v[2:3], v[0:1], off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %vec = load <8 x i8>, ptr addrspace(1) %ptr
   %insert = insertelement <8 x i8> %vec, i8 %val, i32 %idx
@@ -2595,8 +2551,6 @@ define amdgpu_ps void @insertelement_v_v8i8_v_v(ptr addrspace(1) %ptr, i8 %val,
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
 ; GFX11-NEXT:    global_store_b64 v[2:3], v[0:1], off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %vec = load <8 x i8>, ptr addrspace(1) %ptr
   %insert = insertelement <8 x i8> %vec, i8 %val, i32 %idx
@@ -2779,8 +2733,6 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(ptr addrspace(4) inreg %ptr, i8
 ; GFX11-NEXT:    v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX11-NEXT:    global_store_b128 v[4:5], v[0:3], off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %vec = load <16 x i8>, ptr addrspace(4) %ptr
   %insert = insertelement <16 x i8> %vec, i8 %val, i32 %idx
@@ -2936,8 +2888,6 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(ptr addrspace(1) %ptr, i8 inreg
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s1
 ; GFX11-NEXT:    global_store_b128 v[4:5], v[0:3], off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %vec = load <16 x i8>, ptr addrspace(1 ) %ptr
   %insert = insertelement <16 x i8> %vec, i8 %val, i32 %idx
@@ -3119,8 +3069,6 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(ptr addrspace(4) inreg %ptr, i8
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s5, 3
 ; GFX11-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc_lo
 ; GFX11-NEXT:    global_store_b128 v[4:5], v[0:3], off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %vec = load <16 x i8>, ptr addrspace(4) %ptr
   %insert = insertelement <16 x i8> %vec, i8 %val, i32 %idx
@@ -3307,8 +3255,6 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(ptr addrspace(4) inreg %ptr, i8
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, v7, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s1
 ; GFX11-NEXT:    global_store_b128 v[4:5], v[0:3], off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %vec = load <16 x i8>, ptr addrspace(4) %ptr
   %insert = insertelement <16 x i8> %vec, i8 %val, i32 %idx
@@ -3495,8 +3441,6 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(ptr addrspace(4) inreg %ptr, i8
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, v7, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s1
 ; GFX11-NEXT:    global_store_b128 v[4:5], v[0:3], off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %vec = load <16 x i8>, ptr addrspace(4) %ptr
   %insert = insertelement <16 x i8> %vec, i8 %val, i32 %idx
@@ -3653,8 +3597,6 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(ptr addrspace(1) %ptr, i8 inreg
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, v5, v9, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v3, v6, v9, s1
 ; GFX11-NEXT:    global_store_b128 v[7:8], v[0:3], off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %vec = load <16 x i8>, ptr addrspace(1) %ptr
   %insert = insertelement <16 x i8> %vec, i8 %val, i32 %idx
@@ -3807,8 +3749,6 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(ptr addrspace(1) %ptr, i8 %val,
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, v5, v9, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v3, v6, v9, s1
 ; GFX11-NEXT:    global_store_b128 v[7:8], v[0:3], off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %vec = load <16 x i8>, ptr addrspace(1) %ptr
   %insert = insertelement <16 x i8> %vec, i8 %val, i32 %idx
@@ -3962,8 +3902,6 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(ptr addrspace(1) %ptr, i8 %val,
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, v6, v3, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s1
 ; GFX11-NEXT:    global_store_b128 v[8:9], v[0:3], off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %vec = load <16 x i8>, ptr addrspace(1) %ptr
   %insert = insertelement <16 x i8> %vec, i8 %val, i32 %idx
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll
index 5185f6c4ada5ba..8ff1e1d8d072d6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll
@@ -148,8 +148,6 @@ define amdgpu_kernel void @v_insert_v64i32_37(ptr addrspace(1) %ptr.in, ptr addr
 ; GFX11-NEXT:    global_store_b128 v64, v[56:59], s[2:3] offset:224
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b128 v64, v[60:63], s[2:3] offset:240
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.in = getelementptr <64 x i32>, ptr addrspace(1) %ptr.in, i32 %id
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
index 49d9ad1c0f7974..df1afdf77983cc 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
@@ -1078,8 +1078,6 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_s_v(<8 x double> inreg %vec, do
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    global_store_b128 v[0:1], v[13:16], off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %insert = insertelement <8 x double> %vec, double %val, i32 %idx
@@ -1228,8 +1226,6 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_v_s(<8 x double> inreg %vec, do
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    global_store_b128 v[0:1], v[14:17], off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %insert = insertelement <8 x double> %vec, double %val, i32 %idx
@@ -1290,8 +1286,6 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_s_s(<8 x double> %vec, double i
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    global_store_b128 v[0:1], v[12:15], off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %insert = insertelement <8 x double> %vec, double %val, i32 %idx
@@ -1497,8 +1491,6 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_v_v(<8 x double> inreg %vec, do
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    global_store_b128 v[0:1], v[15:18], off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %insert = insertelement <8 x double> %vec, double %val, i32 %idx
@@ -1622,8 +1614,6 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_s_v(<8 x double> %vec, double i
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    global_store_b128 v[0:1], v[12:15], off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %insert = insertelement <8 x double> %vec, double %val, i32 %idx
@@ -1684,8 +1674,6 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_v_s(<8 x double> %vec, double %
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    global_store_b128 v[0:1], v[12:15], off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %insert = insertelement <8 x double> %vec, double %val, i32 %idx
@@ -1803,8 +1791,6 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_v_v(<8 x double> %vec, double %
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    global_store_b128 v[0:1], v[12:15], off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %insert = insertelement <8 x double> %vec, double %val, i32 %idx
@@ -2411,8 +2397,6 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_s_s_add_1(<8 x double> inreg %v
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    global_store_b128 v[0:1], v[12:15], off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %idx.add = add i32 %idx, 1
@@ -2537,8 +2521,6 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_v_v_add_1(<8 x double> %vec, do
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    global_store_b128 v[0:1], v[12:15], off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %idx.add = add i32 %idx, 1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
index 2edcf23df411df..927a31d3992b06 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
@@ -478,8 +478,6 @@ define amdgpu_ps void @non_cst_non_compare_input(ptr addrspace(1) %out, i32 %tid
 ; GFX11-NEXT:    s_and_b32 s0, s0, exec_lo
 ; GFX11-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %cmp = icmp eq i32 %cond, 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll
index f88125ea02937f..19ccb476a0a07f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll
@@ -74,8 +74,6 @@ define amdgpu_kernel void @test_div_scale_f32_1(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    v_div_scale_f32 v0, null, v0, v0, v1
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
@@ -159,8 +157,6 @@ define amdgpu_kernel void @test_div_scale_f32_2(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    v_div_scale_f32 v0, null, v1, v0, v1
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
@@ -250,8 +246,6 @@ define amdgpu_kernel void @test_div_scale_f64_1(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
@@ -341,8 +335,6 @@ define amdgpu_kernel void @test_div_scale_f64_2(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
@@ -421,8 +413,6 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_num_1(ptr addrspace(1) %out
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_div_scale_f32 v0, null, v0, v0, s0
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr float, ptr addrspace(1) %in, i32 %tid
@@ -499,8 +489,6 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_num_2(ptr addrspace(1) %out
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_div_scale_f32 v0, null, s0, v0, s0
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr float, ptr addrspace(1) %in, i32 %tid
@@ -577,8 +565,6 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_den_1(ptr addrspace(1) %out
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_div_scale_f32 v0, null, s0, s0, v0
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr float, ptr addrspace(1) %in, i32 %tid
@@ -655,8 +641,6 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_den_2(ptr addrspace(1) %out
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_div_scale_f32 v0, null, v0, s0, v0
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr float, ptr addrspace(1) %in, i32 %tid
@@ -734,8 +718,6 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_num_1(ptr addrspace(1) %out
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_div_scale_f64 v[0:1], null, v[0:1], v[0:1], s[0:1]
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr double, ptr addrspace(1) %in, i32 %tid
@@ -813,8 +795,6 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_num_2(ptr addrspace(1) %out
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_div_scale_f64 v[0:1], null, s[0:1], v[0:1], s[0:1]
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr double, ptr addrspace(1) %in, i32 %tid
@@ -892,8 +872,6 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_den_1(ptr addrspace(1) %out
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_div_scale_f64 v[0:1], null, s[0:1], s[0:1], v[0:1]
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr double, ptr addrspace(1) %in, i32 %tid
@@ -971,8 +949,6 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_den_2(ptr addrspace(1) %out
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_div_scale_f64 v[0:1], null, v[0:1], s[0:1], v[0:1]
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr double, ptr addrspace(1) %in, i32 %tid
@@ -1035,8 +1011,6 @@ define amdgpu_kernel void @test_div_scale_f32_all_scalar_1(ptr addrspace(1) %out
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_div_scale_f32 v0, null, s5, s5, s4
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 false)
   %result0 = extractvalue { float, i1 } %result, 0
@@ -1094,8 +1068,6 @@ define amdgpu_kernel void @test_div_scale_f32_all_scalar_2(ptr addrspace(1) %out
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_div_scale_f32 v0, null, s4, s5, s4
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 true)
   %result0 = extractvalue { float, i1 } %result, 0
@@ -1157,8 +1129,6 @@ define amdgpu_kernel void @test_div_scale_f64_all_scalar_1(ptr addrspace(1) %out
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 false)
   %result0 = extractvalue { double, i1 } %result, 0
@@ -1220,8 +1190,6 @@ define amdgpu_kernel void @test_div_scale_f64_all_scalar_2(ptr addrspace(1) %out
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 true)
   %result0 = extractvalue { double, i1 } %result, 0
@@ -1287,8 +1255,6 @@ define amdgpu_kernel void @test_div_scale_f32_inline_imm_num(ptr addrspace(1) %o
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_div_scale_f32 v0, null, v0, v0, 1.0
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
@@ -1358,8 +1324,6 @@ define amdgpu_kernel void @test_div_scale_f32_inline_imm_den(ptr addrspace(1) %o
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_div_scale_f32 v0, null, 2.0, 2.0, v0
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
@@ -1445,8 +1409,6 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_num(ptr addrspace(1) %out, pt
 ; GFX11-NEXT:    v_div_scale_f32 v0, null, v0, v0, v1
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
@@ -1537,8 +1499,6 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_den(ptr addrspace(1) %out, pt
 ; GFX11-NEXT:    v_div_scale_f32 v0, null, v0, v0, v1
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
@@ -1594,8 +1554,6 @@ define amdgpu_kernel void @test_div_scale_f32_val_undef_val(ptr addrspace(1) %ou
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_div_scale_f32 v0, null, s0, s0, 0x41000000
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float 8.0, float undef, i1 false)
   %result0 = extractvalue { float, i1 } %result, 0
@@ -1642,8 +1600,6 @@ define amdgpu_kernel void @test_div_scale_f32_undef_val_val(ptr addrspace(1) %ou
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_div_scale_f32 v0, null, 0x41000000, 0x41000000, s0
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float undef, float 8.0, i1 false)
   %result0 = extractvalue { float, i1 } %result, 0
@@ -1688,8 +1644,6 @@ define amdgpu_kernel void @test_div_scale_f32_undef_undef_val(ptr addrspace(1) %
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_div_scale_f32 v0, null, s0, s0, s0
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float undef, float undef, i1 false)
   %result0 = extractvalue { float, i1 } %result, 0
@@ -1738,8 +1692,6 @@ define amdgpu_kernel void @test_div_scale_f64_val_undef_val(ptr addrspace(1) %ou
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double 8.0, double undef, i1 false)
   %result0 = extractvalue { double, i1 } %result, 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll
index 6415e185446f53..287546750d878d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll
@@ -40,8 +40,6 @@ define amdgpu_kernel void @test_wave32(i32 %arg0, [8 x i32], i32 %saved) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    global_store_b32 v[0:1], v0, off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %cond = icmp eq i32 %arg0, 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll
index 59818b0b1bc39b..4a9594ad45e15c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll
@@ -154,8 +154,6 @@ define amdgpu_kernel void @global_atomic_csub_sgpr_base_offset(ptr addrspace(1)
 ; GFX11-NEXT:    global_atomic_csub_u32 v0, v1, v0, s[0:1] glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v[0:1], v0, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_atomic_csub_sgpr_base_offset:
@@ -166,8 +164,6 @@ define amdgpu_kernel void @global_atomic_csub_sgpr_base_offset(ptr addrspace(1)
 ; GFX12-NEXT:    global_atomic_sub_clamp_u32 v0, v1, v0, s[0:1] offset:4096 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v[0:1], v0, off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024
   %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %gep, i32 %data)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i32.ll
index 81c73c789239db..572894af885174 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i32.ll
@@ -33,8 +33,6 @@ define amdgpu_kernel void @test_wave32(i32 %arg0, [8 x i32], i32 %saved) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-NEXT:    global_store_b32 v[0:1], v0, off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %cond = icmp eq i32 %arg0, 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.ll
index 9e445d034edc95..c1c383eb583aa7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.ll
@@ -56,8 +56,6 @@ define amdgpu_ps void @image_store_f32(<8 x i32> inreg %rsrc, i32 %s, i32 %t, fl
 ; GFX11-NEXT:    s_mov_b32 s6, s8
 ; GFX11-NEXT:    s_mov_b32 s7, s9
 ; GFX11-NEXT:    image_store v2, v[0:1], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D unorm
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: image_store_f32:
@@ -71,8 +69,6 @@ define amdgpu_ps void @image_store_f32(<8 x i32> inreg %rsrc, i32 %s, i32 %t, fl
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_store v2, [v0, v1], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   call void @llvm.amdgcn.image.store.2d.f32.i32(float %data, i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
   ret void
@@ -129,8 +125,6 @@ define amdgpu_ps void @image_store_v2f32(<8 x i32> inreg %rsrc, i32 %s, i32 %t,
 ; GFX11-NEXT:    s_mov_b32 s6, s8
 ; GFX11-NEXT:    s_mov_b32 s7, s9
 ; GFX11-NEXT:    image_store v[2:3], v[0:1], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D unorm
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: image_store_v2f32:
@@ -144,8 +138,6 @@ define amdgpu_ps void @image_store_v2f32(<8 x i32> inreg %rsrc, i32 %s, i32 %t,
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_store v[2:3], [v0, v1], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   call void @llvm.amdgcn.image.store.2d.v2f32.i32(<2 x float> %in, i32 3, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
   ret void
@@ -202,8 +194,6 @@ define amdgpu_ps void @image_store_v3f32(<8 x i32> inreg %rsrc, i32 %s, i32 %t,
 ; GFX11-NEXT:    s_mov_b32 s6, s8
 ; GFX11-NEXT:    s_mov_b32 s7, s9
 ; GFX11-NEXT:    image_store v[2:4], v[0:1], s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_2D unorm
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: image_store_v3f32:
@@ -217,8 +207,6 @@ define amdgpu_ps void @image_store_v3f32(<8 x i32> inreg %rsrc, i32 %s, i32 %t,
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_store v[2:4], [v0, v1], s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_2D
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   call void @llvm.amdgcn.image.store.2d.v3f32.i32(<3 x float> %in, i32 7, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
   ret void
@@ -275,8 +263,6 @@ define amdgpu_ps void @image_store_v4f32(<8 x i32> inreg %rsrc, i32 %s, i32 %t,
 ; GFX11-NEXT:    s_mov_b32 s6, s8
 ; GFX11-NEXT:    s_mov_b32 s7, s9
 ; GFX11-NEXT:    image_store v[2:5], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: image_store_v4f32:
@@ -290,8 +276,6 @@ define amdgpu_ps void @image_store_v4f32(<8 x i32> inreg %rsrc, i32 %s, i32 %t,
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_store v[2:5], [v0, v1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   call void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float> %in, i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
   ret void
@@ -348,8 +332,6 @@ define amdgpu_ps void @image_store_v4f32_dmask_0001(<8 x i32> inreg %rsrc, i32 %
 ; GFX11-NEXT:    s_mov_b32 s6, s8
 ; GFX11-NEXT:    s_mov_b32 s7, s9
 ; GFX11-NEXT:    image_store v[2:5], v[0:1], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D unorm
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: image_store_v4f32_dmask_0001:
@@ -363,8 +345,6 @@ define amdgpu_ps void @image_store_v4f32_dmask_0001(<8 x i32> inreg %rsrc, i32 %
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_store v[2:5], [v0, v1], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   call void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float> %in, i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
   ret void
@@ -421,8 +401,6 @@ define amdgpu_ps void @image_store_v4f32_dmask_0010(<8 x i32> inreg %rsrc, i32 %
 ; GFX11-NEXT:    s_mov_b32 s6, s8
 ; GFX11-NEXT:    s_mov_b32 s7, s9
 ; GFX11-NEXT:    image_store v[2:5], v[0:1], s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_2D unorm
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: image_store_v4f32_dmask_0010:
@@ -436,8 +414,6 @@ define amdgpu_ps void @image_store_v4f32_dmask_0010(<8 x i32> inreg %rsrc, i32 %
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_store v[2:5], [v0, v1], s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_2D
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   call void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float> %in, i32 2, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
   ret void
@@ -494,8 +470,6 @@ define amdgpu_ps void @image_store_v4f32_dmask_0100(<8 x i32> inreg %rsrc, i32 %
 ; GFX11-NEXT:    s_mov_b32 s6, s8
 ; GFX11-NEXT:    s_mov_b32 s7, s9
 ; GFX11-NEXT:    image_store v[2:5], v[0:1], s[0:7] dmask:0x4 dim:SQ_RSRC_IMG_2D unorm
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: image_store_v4f32_dmask_0100:
@@ -509,8 +483,6 @@ define amdgpu_ps void @image_store_v4f32_dmask_0100(<8 x i32> inreg %rsrc, i32 %
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_store v[2:5], [v0, v1], s[0:7] dmask:0x4 dim:SQ_RSRC_IMG_2D
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   call void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float> %in, i32 4, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
   ret void
@@ -567,8 +539,6 @@ define amdgpu_ps void @image_store_v4f32_dmask_1000(<8 x i32> inreg %rsrc, i32 %
 ; GFX11-NEXT:    s_mov_b32 s6, s8
 ; GFX11-NEXT:    s_mov_b32 s7, s9
 ; GFX11-NEXT:    image_store v[2:5], v[0:1], s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_2D unorm
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: image_store_v4f32_dmask_1000:
@@ -582,8 +552,6 @@ define amdgpu_ps void @image_store_v4f32_dmask_1000(<8 x i32> inreg %rsrc, i32 %
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_store v[2:5], [v0, v1], s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_2D
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   call void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float> %in, i32 8, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
   ret void
@@ -640,8 +608,6 @@ define amdgpu_ps void @image_store_v4f32_dmask_0011(<8 x i32> inreg %rsrc, i32 %
 ; GFX11-NEXT:    s_mov_b32 s6, s8
 ; GFX11-NEXT:    s_mov_b32 s7, s9
 ; GFX11-NEXT:    image_store v[2:5], v[0:1], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D unorm
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: image_store_v4f32_dmask_0011:
@@ -655,8 +621,6 @@ define amdgpu_ps void @image_store_v4f32_dmask_0011(<8 x i32> inreg %rsrc, i32 %
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_store v[2:5], [v0, v1], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   call void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float> %in, i32 3, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
   ret void
@@ -713,8 +677,6 @@ define amdgpu_ps void @image_store_v4f32_dmask_0110(<8 x i32> inreg %rsrc, i32 %
 ; GFX11-NEXT:    s_mov_b32 s6, s8
 ; GFX11-NEXT:    s_mov_b32 s7, s9
 ; GFX11-NEXT:    image_store v[2:5], v[0:1], s[0:7] dmask:0x6 dim:SQ_RSRC_IMG_2D unorm
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: image_store_v4f32_dmask_0110:
@@ -728,8 +690,6 @@ define amdgpu_ps void @image_store_v4f32_dmask_0110(<8 x i32> inreg %rsrc, i32 %
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_store v[2:5], [v0, v1], s[0:7] dmask:0x6 dim:SQ_RSRC_IMG_2D
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   call void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float> %in, i32 6, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
   ret void
@@ -793,8 +753,6 @@ define amdgpu_ps void @image_store_f32_dmask_1111(<8 x i32> inreg %rsrc, i32 inr
 ; GFX11-NEXT:    s_mov_b32 s6, s8
 ; GFX11-NEXT:    s_mov_b32 s7, s9
 ; GFX11-NEXT:    image_store v0, v[1:2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: image_store_f32_dmask_1111:
@@ -809,8 +767,6 @@ define amdgpu_ps void @image_store_f32_dmask_1111(<8 x i32> inreg %rsrc, i32 inr
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
 ; GFX12-NEXT:    image_store v0, [v1, v2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   tail call void @llvm.amdgcn.image.store.2d.f32.i32(float %in, i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll
index 4946d3759c2ef2..3b402f919f3430 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll
@@ -40,8 +40,6 @@ define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in) {
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 ; encoding: [0x80,0x00,0x10,0xca,0x04,0x00,0x00,0x01]
 ; GFX11-NEXT:    v_mov_b32_dpp v0, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:1 ; encoding: [0xfa,0x02,0x00,0x7e,0x00,0x01,0x08,0x11]
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1] ; encoding: [0x00,0x00,0x6a,0xdc,0x01,0x00,0x00,0x00]
-; GFX11-NEXT:    s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
 ; GFX11-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
   %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %in, i32 1, i32 1, i32 1, i1 true) #0
   store i32 %tmp0, ptr addrspace(1) %out
@@ -82,8 +80,6 @@ define amdgpu_kernel void @mov_dpp64_test(ptr addrspace(1) %out, i64 %in1) {
 ; GFX11-NEXT:    v_mov_b32_dpp v0, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; encoding: [0xfa,0x02,0x00,0x7e,0x00,0x01,0x00,0x11]
 ; GFX11-NEXT:    v_mov_b32_dpp v1, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; encoding: [0xfa,0x02,0x02,0x7e,0x01,0x01,0x00,0x11]
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1] ; encoding: [0x00,0x00,0x6e,0xdc,0x02,0x00,0x00,0x00]
-; GFX11-NEXT:    s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
 ; GFX11-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
   %tmp0 = call i64 @llvm.amdgcn.mov.dpp.i64(i64 %in1, i32 1, i32 1, i32 1, i1 false) #0
   store i64 %tmp0, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
index 1e8209bd3fc6b9..4b0f2ef77a9834 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
@@ -36,8 +36,6 @@ define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in1, i32 %in2) {
 ; GFX11-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 1, i32 1, i32 1, i1 false)
   store i32 %tmp0, ptr addrspace(1) %out
@@ -88,8 +86,6 @@ define amdgpu_kernel void @update_dppi64_test(ptr addrspace(1) %arg, i64 %in1, i
 ; GFX11-NEXT:    v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; GFX11-NEXT:    v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; GFX11-NEXT:    global_store_b64 v4, v[2:3], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds i64, ptr addrspace(1) %arg, i32 %id
@@ -144,8 +140,6 @@ define amdgpu_kernel void @update_dppf64_test(ptr addrspace(1) %arg, double %in1
 ; GFX11-NEXT:    v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; GFX11-NEXT:    v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; GFX11-NEXT:    global_store_b64 v4, v[2:3], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds double, ptr addrspace(1) %arg, i32 %id
@@ -200,8 +194,6 @@ define amdgpu_kernel void @update_dppv2i32_test(ptr addrspace(1) %arg, <2 x i32>
 ; GFX11-NEXT:    v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; GFX11-NEXT:    v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; GFX11-NEXT:    global_store_b64 v4, v[2:3], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds <2 x i32>, ptr addrspace(1) %arg, i32 %id
@@ -256,8 +248,6 @@ define amdgpu_kernel void @update_dppv2f32_test(ptr addrspace(1) %arg, <2 x floa
 ; GFX11-NEXT:    v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; GFX11-NEXT:    v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; GFX11-NEXT:    global_store_b64 v4, v[2:3], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %arg, i32 %id
@@ -312,8 +302,6 @@ define amdgpu_kernel void @update_dpp_p0_test(ptr addrspace(1) %arg, ptr %in1, p
 ; GFX11-NEXT:    v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; GFX11-NEXT:    v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; GFX11-NEXT:    global_store_b64 v4, v[2:3], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds ptr, ptr addrspace(1) %arg, i32 %id
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_32.ll
index cc0e34be02a765..603eb88c07afb4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_32.ll
@@ -19,8 +19,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<16 x half> %A, <16 x half> %B
 ; W32-NEXT:    s_clause 0x1
 ; W32-NEXT:    global_store_b128 v[24:25], v[16:19], off
 ; W32-NEXT:    global_store_b128 v[24:25], v[20:23], off offset:16
-; W32-NEXT:    s_nop 0
-; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half> %A, <16 x half> %B, <8 x float> %C)
@@ -37,8 +35,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<16 x i16> %A, <16 x i16> %B,
 ; W32-NEXT:    s_clause 0x1
 ; W32-NEXT:    global_store_b128 v[24:25], v[16:19], off
 ; W32-NEXT:    global_store_b128 v[24:25], v[20:23], off offset:16
-; W32-NEXT:    s_nop 0
-; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <8 x float> %C)
@@ -55,8 +51,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<16 x half> %A, <16 x half>
 ; W32-NEXT:    s_clause 0x1
 ; W32-NEXT:    global_store_b128 v[24:25], v[16:19], off
 ; W32-NEXT:    global_store_b128 v[24:25], v[20:23], off offset:16
-; W32-NEXT:    s_nop 0
-; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
   %res = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, i1 0)
@@ -71,8 +65,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<16 x half> %A, <16 x half>
 ; W32-NEXT:    s_clause 0x1
 ; W32-NEXT:    global_store_b128 v[24:25], v[16:19], off
 ; W32-NEXT:    global_store_b128 v[24:25], v[20:23], off offset:16
-; W32-NEXT:    s_nop 0
-; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
   %res = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, i1 1)
@@ -91,8 +83,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_untied(<16 x half> %A.0, <16 x
 ; W32-NEXT:    s_clause 0x1
 ; W32-NEXT:    global_store_b128 v[42:43], v[32:35], off
 ; W32-NEXT:    global_store_b128 v[42:43], v[36:39], off offset:16
-; W32-NEXT:    s_nop 0
-; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
   %res.0 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A.0, <16 x half> %B.0, <16 x half> %C, i1 0)
@@ -118,8 +108,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_tied(<16 x half> %A.0, <16 x h
 ; W32-NEXT:    s_clause 0x1
 ; W32-NEXT:    global_store_b128 v[42:43], v[32:35], off
 ; W32-NEXT:    global_store_b128 v[42:43], v[36:39], off offset:16
-; W32-NEXT:    s_nop 0
-; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
   %res.0 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.tied(<16 x half> %A.0, <16 x half> %B.0, <16 x half> %C, i1 0)
@@ -138,8 +126,6 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<16 x i16> %A, <16 x i16>
 ; W32-NEXT:    s_clause 0x1
 ; W32-NEXT:    global_store_b128 v[24:25], v[16:19], off
 ; W32-NEXT:    global_store_b128 v[24:25], v[20:23], off offset:16
-; W32-NEXT:    s_nop 0
-; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
   %res = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, i1 0)
@@ -154,8 +140,6 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<16 x i16> %A, <16 x i16>
 ; W32-NEXT:    s_clause 0x1
 ; W32-NEXT:    global_store_b128 v[24:25], v[16:19], off
 ; W32-NEXT:    global_store_b128 v[24:25], v[20:23], off offset:16
-; W32-NEXT:    s_nop 0
-; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
   %res = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, i1 1)
@@ -174,8 +158,6 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_untied(<16 x i16> %A.0, <16
 ; W32-NEXT:    s_clause 0x1
 ; W32-NEXT:    global_store_b128 v[42:43], v[32:35], off
 ; W32-NEXT:    global_store_b128 v[42:43], v[36:39], off offset:16
-; W32-NEXT:    s_nop 0
-; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
   %res.0 = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A.0, <16 x i16> %B.0, <16 x i16> %C, i1 0)
@@ -201,8 +183,6 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_tied(<16 x i16> %A.0, <16 x
 ; W32-NEXT:    s_clause 0x1
 ; W32-NEXT:    global_store_b128 v[42:43], v[32:35], off
 ; W32-NEXT:    global_store_b128 v[42:43], v[36:39], off offset:16
-; W32-NEXT:    s_nop 0
-; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
   %res.0 = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.tied(<16 x i16> %A.0, <16 x i16> %B.0, <16 x i16> %C, i1 0)
@@ -221,8 +201,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A
 ; W32-NEXT:    s_clause 0x1
 ; W32-NEXT:    global_store_b128 v[16:17], v[8:11], off
 ; W32-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
-; W32-NEXT:    s_nop 0
-; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 0)
@@ -237,8 +215,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed(<4 x i32> %A,
 ; W32-NEXT:    s_clause 0x1
 ; W32-NEXT:    global_store_b128 v[16:17], v[8:11], off
 ; W32-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
-; W32-NEXT:    s_nop 0
-; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 0)
@@ -253,8 +229,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned(<4 x i32> %A,
 ; W32-NEXT:    s_clause 0x1
 ; W32-NEXT:    global_store_b128 v[16:17], v[8:11], off
 ; W32-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
-; W32-NEXT:    s_nop 0
-; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 0)
@@ -269,8 +243,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed(<4 x i32> %A, <4
 ; W32-NEXT:    s_clause 0x1
 ; W32-NEXT:    global_store_b128 v[16:17], v[8:11], off
 ; W32-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
-; W32-NEXT:    s_nop 0
-; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 0)
@@ -285,8 +257,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp(<4 x i
 ; W32-NEXT:    s_clause 0x1
 ; W32-NEXT:    global_store_b128 v[16:17], v[8:11], off
 ; W32-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
-; W32-NEXT:    s_nop 0
-; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 1)
@@ -301,8 +271,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp(<4 x i32
 ; W32-NEXT:    s_clause 0x1
 ; W32-NEXT:    global_store_b128 v[16:17], v[8:11], off
 ; W32-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
-; W32-NEXT:    s_nop 0
-; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 1)
@@ -317,8 +285,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp(<4 x i32
 ; W32-NEXT:    s_clause 0x1
 ; W32-NEXT:    global_store_b128 v[16:17], v[8:11], off
 ; W32-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
-; W32-NEXT:    s_nop 0
-; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 1)
@@ -333,8 +299,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed_clamp(<4 x i32>
 ; W32-NEXT:    s_clause 0x1
 ; W32-NEXT:    global_store_b128 v[16:17], v[8:11], off
 ; W32-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
-; W32-NEXT:    s_nop 0
-; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 1)
@@ -351,8 +315,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned(<2 x i32> %A
 ; W32-NEXT:    s_clause 0x1
 ; W32-NEXT:    global_store_b128 v[12:13], v[4:7], off
 ; W32-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
-; W32-NEXT:    s_nop 0
-; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
@@ -367,8 +329,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed(<2 x i32> %A,
 ; W32-NEXT:    s_clause 0x1
 ; W32-NEXT:    global_store_b128 v[12:13], v[4:7], off
 ; W32-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
-; W32-NEXT:    s_nop 0
-; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
@@ -383,8 +343,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned(<2 x i32> %A,
 ; W32-NEXT:    s_clause 0x1
 ; W32-NEXT:    global_store_b128 v[12:13], v[4:7], off
 ; W32-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
-; W32-NEXT:    s_nop 0
-; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
@@ -399,8 +357,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed(<2 x i32> %A, <2
 ; W32-NEXT:    s_clause 0x1
 ; W32-NEXT:    global_store_b128 v[12:13], v[4:7], off
 ; W32-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
-; W32-NEXT:    s_nop 0
-; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
@@ -416,8 +372,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp(<2 x i
 ; W32-NEXT:    s_clause 0x1
 ; W32-NEXT:    global_store_b128 v[12:13], v[4:7], off
 ; W32-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
-; W32-NEXT:    s_nop 0
-; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
@@ -432,8 +386,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp(<2 x i32
 ; W32-NEXT:    s_clause 0x1
 ; W32-NEXT:    global_store_b128 v[12:13], v[4:7], off
 ; W32-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
-; W32-NEXT:    s_nop 0
-; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 1)
@@ -448,8 +400,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp(<2 x i32
 ; W32-NEXT:    s_clause 0x1
 ; W32-NEXT:    global_store_b128 v[12:13], v[4:7], off
 ; W32-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
-; W32-NEXT:    s_nop 0
-; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
@@ -464,8 +414,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed_clamp(<2 x i32>
 ; W32-NEXT:    s_clause 0x1
 ; W32-NEXT:    global_store_b128 v[12:13], v[4:7], off
 ; W32-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
-; W32-NEXT:    s_nop 0
-; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 1)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_64.ll
index 112a7d98ce7ec1..7deaca4ca78b43 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_64.ll
@@ -17,8 +17,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<16 x half> %A, <16 x half> %B
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19]
 ; W64-NEXT:    global_store_b128 v[20:21], v[16:19], off
-; W64-NEXT:    s_nop 0
-; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half> %A, <16 x half> %B, <4 x float> %C)
@@ -33,8 +31,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<16 x i16> %A, <16 x i16> %B,
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19]
 ; W64-NEXT:    global_store_b128 v[20:21], v[16:19], off
-; W64-NEXT:    s_nop 0
-; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <4 x float> %C)
@@ -49,8 +45,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<16 x half> %A, <16 x half>
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19]
 ; W64-NEXT:    global_store_b128 v[20:21], v[16:19], off
-; W64-NEXT:    s_nop 0
-; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
   %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A, <16 x half> %B, <8 x half> %C, i1 0)
@@ -63,8 +57,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<16 x half> %A, <16 x half>
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1]
 ; W64-NEXT:    global_store_b128 v[20:21], v[16:19], off
-; W64-NEXT:    s_nop 0
-; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
   %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A, <16 x half> %B, <8 x half> %C, i1 1)
@@ -79,8 +71,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_untied(<16 x half> %A.0, <16 x
 ; W64-NEXT:    v_wmma_f16_16x16x16_f16 v[32:35], v[16:23], v[24:31], v[32:35]
 ; W64-NEXT:    global_store_b128 v[36:37], v[40:43], off
 ; W64-NEXT:    global_store_b128 v[38:39], v[32:35], off
-; W64-NEXT:    s_nop 0
-; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
   %res.0 = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A.0, <16 x half> %B.0, <8 x half> %C, i1 0)
@@ -102,8 +92,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_tied(<16 x half> %A.0, <16 x h
 ; W64-NEXT:    v_wmma_f16_16x16x16_f16 v[40:43], v[0:7], v[8:15], v[40:43]
 ; W64-NEXT:    global_store_b128 v[36:37], v[40:43], off
 ; W64-NEXT:    global_store_b128 v[38:39], v[32:35], off
-; W64-NEXT:    s_nop 0
-; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
   %res.0 = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.tied(<16 x half> %A.0, <16 x half> %B.0, <8 x half> %C, i1 0)
@@ -120,8 +108,6 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<16 x i16> %A, <16 x i16>
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19]
 ; W64-NEXT:    global_store_b128 v[20:21], v[16:19], off
-; W64-NEXT:    s_nop 0
-; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
   %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, i1 0)
@@ -134,8 +120,6 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<16 x i16> %A, <16 x i16>
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1]
 ; W64-NEXT:    global_store_b128 v[20:21], v[16:19], off
-; W64-NEXT:    s_nop 0
-; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
   %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, i1 1)
@@ -150,8 +134,6 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_untied(<16 x i16> %A.0, <16
 ; W64-NEXT:    v_wmma_bf16_16x16x16_bf16 v[32:35], v[16:23], v[24:31], v[32:35]
 ; W64-NEXT:    global_store_b128 v[36:37], v[40:43], off
 ; W64-NEXT:    global_store_b128 v[38:39], v[32:35], off
-; W64-NEXT:    s_nop 0
-; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
   %res.0 = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A.0, <16 x i16> %B.0, <8 x i16> %C, i1 0)
@@ -173,8 +155,6 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_tied(<16 x i16> %A.0, <16 x
 ; W64-NEXT:    v_wmma_bf16_16x16x16_bf16 v[40:43], v[0:7], v[8:15], v[40:43]
 ; W64-NEXT:    global_store_b128 v[36:37], v[40:43], off
 ; W64-NEXT:    global_store_b128 v[38:39], v[32:35], off
-; W64-NEXT:    s_nop 0
-; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
   %res.0 = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.tied(<16 x i16> %A.0, <16 x i16> %B.0, <8 x i16> %C, i1 0)
@@ -191,8 +171,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11]
 ; W64-NEXT:    global_store_b128 v[12:13], v[8:11], off
-; W64-NEXT:    s_nop 0
-; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 0)
@@ -206,8 +184,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed(<4 x i32> %A,
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0]
 ; W64-NEXT:    global_store_b128 v[12:13], v[8:11], off
-; W64-NEXT:    s_nop 0
-; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 0)
@@ -220,8 +196,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned(<4 x i32> %A,
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0]
 ; W64-NEXT:    global_store_b128 v[12:13], v[8:11], off
-; W64-NEXT:    s_nop 0
-; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 0)
@@ -234,8 +208,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed(<4 x i32> %A, <4
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,1,0]
 ; W64-NEXT:    global_store_b128 v[12:13], v[8:11], off
-; W64-NEXT:    s_nop 0
-; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 0)
@@ -248,8 +220,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp(<4 x i
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] clamp
 ; W64-NEXT:    global_store_b128 v[12:13], v[8:11], off
-; W64-NEXT:    s_nop 0
-; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 1)
@@ -262,8 +232,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp(<4 x i32
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] clamp
 ; W64-NEXT:    global_store_b128 v[12:13], v[8:11], off
-; W64-NEXT:    s_nop 0
-; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 1)
@@ -276,8 +244,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp(<4 x i32
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] clamp
 ; W64-NEXT:    global_store_b128 v[12:13], v[8:11], off
-; W64-NEXT:    s_nop 0
-; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 1)
@@ -290,8 +256,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed_clamp(<4 x i32>
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,1,0] clamp
 ; W64-NEXT:    global_store_b128 v[12:13], v[8:11], off
-; W64-NEXT:    s_nop 0
-; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 1)
@@ -306,8 +270,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned(<2 x i32> %A
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7]
 ; W64-NEXT:    global_store_b128 v[8:9], v[4:7], off
-; W64-NEXT:    s_nop 0
-; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 0)
@@ -320,8 +282,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed(<2 x i32> %A,
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0]
 ; W64-NEXT:    global_store_b128 v[8:9], v[4:7], off
-; W64-NEXT:    s_nop 0
-; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 0)
@@ -334,8 +294,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned(<2 x i32> %A,
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0]
 ; W64-NEXT:    global_store_b128 v[8:9], v[4:7], off
-; W64-NEXT:    s_nop 0
-; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 0)
@@ -348,8 +306,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed(<2 x i32> %A, <2
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,1,0]
 ; W64-NEXT:    global_store_b128 v[8:9], v[4:7], off
-; W64-NEXT:    s_nop 0
-; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 0)
@@ -362,8 +318,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp(<2 x i
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] clamp
 ; W64-NEXT:    global_store_b128 v[8:9], v[4:7], off
-; W64-NEXT:    s_nop 0
-; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 1)
@@ -376,8 +330,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp(<2 x i32
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] clamp
 ; W64-NEXT:    global_store_b128 v[8:9], v[4:7], off
-; W64-NEXT:    s_nop 0
-; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 1)
@@ -390,8 +342,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp(<2 x i32
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] clamp
 ; W64-NEXT:    global_store_b128 v[8:9], v[4:7], off
-; W64-NEXT:    s_nop 0
-; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 1)
@@ -404,8 +354,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed_clamp(<2 x i32>
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,1,0] clamp
 ; W64-NEXT:    global_store_b128 v[8:9], v[4:7], off
-; W64-NEXT:    s_nop 0
-; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 1)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll
index c595c939e8d139..cea848e72ce0c6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll
@@ -325,8 +325,6 @@ define amdgpu_ps void @test_s_load_constant_v8i32_align1(ptr addrspace(4) inreg
 ; GFX11-NEXT:    global_store_b128 v8, v[0:3], s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b128 v8, v[4:7], s[2:3] offset:16
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %load = load <8 x i32>, ptr addrspace(4) %ptr, align 1
   store <8 x i32> %load, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll
index 55ff6410c23508..2c71366772fc97 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll
@@ -31,8 +31,6 @@ define amdgpu_ps void @mubuf_store_sgpr_ptr(ptr addrspace(1) inreg %ptr) {
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-NEXT:    global_store_b32 v0, v0, s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   store i32 0, ptr addrspace(1) %ptr
   ret void
@@ -65,8 +63,6 @@ define amdgpu_ps void @mubuf_store_sgpr_ptr_offset4095(ptr addrspace(1) inreg %p
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-NEXT:    global_store_b32 v0, v0, s[2:3] offset:16380
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4095
   store i32 0, ptr addrspace(1) %gep
@@ -105,8 +101,6 @@ define amdgpu_ps void @mubuf_store_sgpr_ptr_offset4294967296(ptr addrspace(1) in
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    global_store_b32 v[0:1], v2, off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4294967296
   store i32 0, ptr addrspace(1) %gep
@@ -145,8 +139,6 @@ define amdgpu_ps void @mubuf_store_sgpr_ptr_offset4294967297(ptr addrspace(1) in
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    global_store_b32 v[0:1], v2, off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4294967297
   store i32 0, ptr addrspace(1) %gep
@@ -180,8 +172,6 @@ define amdgpu_ps void @mubuf_store_sgpr_ptr_offset4096(ptr addrspace(1) inreg %p
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-NEXT:    global_store_b32 v0, v0, s[2:3] offset:16384
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4096
   store i32 0, ptr addrspace(1) %gep
@@ -213,8 +203,6 @@ define amdgpu_ps void @mubuf_store_vgpr_ptr_offset4095(ptr addrspace(1) %ptr) {
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-NEXT:    global_store_b32 v[0:1], v2, off offset:16380
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4095
   store i32 0, ptr addrspace(1) %gep
@@ -248,8 +236,6 @@ define amdgpu_ps void @mubuf_store_vgpr_ptr_offset4294967296(ptr addrspace(1) %p
 ; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 4, v1, vcc_lo
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-NEXT:    global_store_b32 v[0:1], v2, off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4294967296
   store i32 0, ptr addrspace(1) %gep
@@ -283,8 +269,6 @@ define amdgpu_ps void @mubuf_store_vgpr_ptr_offset4294967297(ptr addrspace(1) %p
 ; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 4, v1, vcc_lo
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-NEXT:    global_store_b32 v[0:1], v2, off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4294967297
   store i32 0, ptr addrspace(1) %gep
@@ -316,8 +300,6 @@ define amdgpu_ps void @mubuf_store_vgpr_ptr_offset4096(ptr addrspace(1) %ptr) {
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-NEXT:    global_store_b32 v[0:1], v2, off offset:16384
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4096
   store i32 0, ptr addrspace(1) %gep
@@ -362,8 +344,6 @@ define amdgpu_ps void @mubuf_store_sgpr_ptr_sgpr_offset(ptr addrspace(1) inreg %
 ; GFX12-NEXT:    s_add_co_u32 s0, s2, s0
 ; GFX12-NEXT:    s_add_co_ci_u32 s1, s3, s1
 ; GFX12-NEXT:    global_store_b32 v0, v0, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 %soffset
   store i32 0, ptr addrspace(1) %gep
@@ -402,8 +382,6 @@ define amdgpu_ps void @mubuf_store_vgpr_ptr_sgpr_offset(ptr addrspace(1) %ptr, i
 ; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-NEXT:    global_store_b32 v[0:1], v2, off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 %soffset
   store i32 0, ptr addrspace(1) %gep
@@ -442,8 +420,6 @@ define amdgpu_ps void @mubuf_store_vgpr_ptr_sgpr_offset_offset256(ptr addrspace(
 ; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-NEXT:    global_store_b32 v[0:1], v2, off offset:1024
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %gep0 = getelementptr i32, ptr addrspace(1) %ptr, i32 %soffset
   %gep1 = getelementptr i32, ptr addrspace(1) %gep0, i32 256
@@ -483,8 +459,6 @@ define amdgpu_ps void @mubuf_store_vgpr_ptr_sgpr_offset256_offset(ptr addrspace(
 ; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-NEXT:    global_store_b32 v[0:1], v2, off offset:1024
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %gep0 = getelementptr i32, ptr addrspace(1) %ptr, i32 256
   %gep1 = getelementptr i32, ptr addrspace(1) %gep0, i32 %soffset
@@ -528,8 +502,6 @@ define amdgpu_ps void @mubuf_store_sgpr_ptr_vgpr_offset(ptr addrspace(1) inreg %
 ; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-NEXT:    global_store_b32 v[0:1], v2, off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 %voffset
   store i32 0, ptr addrspace(1) %gep
@@ -574,8 +546,6 @@ define amdgpu_ps void @mubuf_store_sgpr_ptr_vgpr_offset_offset4095(ptr addrspace
 ; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-NEXT:    global_store_b32 v[0:1], v2, off offset:16380
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %gep0 = getelementptr i32, ptr addrspace(1) %ptr, i32 %voffset
   %gep1 = getelementptr i32, ptr addrspace(1) %gep0, i32 4095
@@ -620,8 +590,6 @@ define amdgpu_ps void @mubuf_store_sgpr_ptr_offset4095_vgpr_offset(ptr addrspace
 ; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-NEXT:    global_store_b32 v[0:1], v2, off offset:16380
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %gep0 = getelementptr i32, ptr addrspace(1) %ptr, i32 4095
   %gep1 = getelementptr i32, ptr addrspace(1) %gep0, i32 %voffset
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll
index 489f46d1237a36..b3b7457da64d1a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll
@@ -39,8 +39,6 @@ define amdgpu_kernel void @v_mul_i64_no_zext(ptr addrspace(1) %out, ptr addrspac
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_mov_b32_e32 v5, v7
 ; GFX11-NEXT:    global_store_b64 v9, v[4:5], s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.a = getelementptr inbounds i64, ptr addrspace(1) %aptr, i32 %tid
@@ -93,8 +91,6 @@ define amdgpu_kernel void @v_mul_i64_zext_src1(ptr addrspace(1) %out, ptr addrsp
 ; GFX11-NEXT:    v_mad_u64_u32 v[3:4], null, v1, v5, v[0:1]
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    global_store_b64 v0, v[2:3], s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.a = getelementptr inbounds i64, ptr addrspace(1) %aptr, i32 %tid
@@ -147,8 +143,6 @@ define amdgpu_kernel void @v_mul_i64_zext_src0(ptr addrspace(1) %out, ptr addrsp
 ; GFX11-NEXT:    v_mad_u64_u32 v[3:4], null, v5, v1, v[0:1]
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    global_store_b64 v0, v[2:3], s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.a = getelementptr inbounds i32, ptr addrspace(1) %aptr, i32 %tid
@@ -195,8 +189,6 @@ define amdgpu_kernel void @v_mul_i64_zext_src0_src1(ptr addrspace(1) %out, ptr a
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v1, v0, 0
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.a = getelementptr inbounds i32, ptr addrspace(1) %aptr, i32 %tid
@@ -250,8 +242,6 @@ define amdgpu_kernel void @v_mul_i64_masked_src0_hi(ptr addrspace(1) %out, ptr a
 ; GFX11-NEXT:    v_mad_u64_u32 v[3:4], null, v5, v1, v[0:1]
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    global_store_b64 v0, v[2:3], s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.a = getelementptr inbounds i64, ptr addrspace(1) %aptr, i32 %tid
@@ -300,8 +290,6 @@ define amdgpu_kernel void @v_mul_i64_masked_src0_lo(ptr addrspace(1) %out, ptr a
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mul_lo_u32 v1, v1, v2
 ; GFX11-NEXT:    global_store_b64 v0, v[0:1], s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.a = getelementptr inbounds i64, ptr addrspace(1) %aptr, i32 %tid
@@ -351,8 +339,6 @@ define amdgpu_kernel void @v_mul_i64_masked_src1_lo(ptr addrspace(1) %out, ptr a
 ; GFX11-NEXT:    v_mul_lo_u32 v1, v0, v2
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    global_store_b64 v0, v[0:1], s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.a = getelementptr inbounds i64, ptr addrspace(1) %aptr, i32 %tid
@@ -384,8 +370,6 @@ define amdgpu_kernel void @v_mul_i64_masked_src0(ptr addrspace(1) %out, ptr addr
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.a = getelementptr inbounds i64, ptr addrspace(1) %aptr, i32 %tid
@@ -448,8 +432,6 @@ define amdgpu_kernel void @v_mul_i64_partially_masked_src0(ptr addrspace(1) %out
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v0, 0
 ; GFX11-NEXT:    global_store_b64 v0, v[4:5], s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.a = getelementptr inbounds i64, ptr addrspace(1) %aptr, i32 %tid
@@ -481,8 +463,6 @@ define amdgpu_kernel void @v_mul64_masked_before_branch(ptr addrspace(1) %out, p
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -583,8 +563,6 @@ define amdgpu_kernel void @v_mul64_masked_before_and_in_branch(ptr addrspace(1)
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
index 42f1bf84c04207..8cc4b7759c3a33 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
@@ -2519,8 +2519,6 @@ define amdgpu_ps void @s_mul_u64_zext_with_vregs(ptr addrspace(1) %out, ptr addr
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mad_u64_u32 v[2:3], null, 0x50, v2, 0
 ; GFX11-NEXT:    global_store_b64 v[0:1], v[2:3], off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: s_mul_u64_zext_with_vregs:
@@ -2529,8 +2527,6 @@ define amdgpu_ps void @s_mul_u64_zext_with_vregs(ptr addrspace(1) %out, ptr addr
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mad_co_u64_u32 v[2:3], null, 0x50, v2, 0
 ; GFX12-NEXT:    global_store_b64 v[0:1], v[2:3], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %val = load i32, ptr addrspace(1) %in, align 4
   %ext = zext i32 %val to i64
@@ -2613,8 +2609,6 @@ define amdgpu_kernel void @s_mul_u64_zext_with_sregs(ptr addrspace(1) %out, ptr
 ; GFX11-NEXT:    s_mul_hi_u32 s3, s3, 0x50
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: s_mul_u64_zext_with_sregs:
@@ -2629,8 +2623,6 @@ define amdgpu_kernel void @s_mul_u64_zext_with_sregs(ptr addrspace(1) %out, ptr
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %val = load i32, ptr addrspace(1) %in, align 4
   %ext = zext i32 %val to i64
@@ -2695,8 +2687,6 @@ define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addr
 ; GFX11-NEXT:    v_mad_u64_u32 v[4:5], null, 0x50, v6, v[3:4]
 ; GFX11-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX11-NEXT:    global_store_b64 v[0:1], v[2:3], off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: s_mul_u64_sext_with_vregs:
@@ -2705,8 +2695,6 @@ define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addr
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mad_co_i64_i32 v[2:3], null, 0x50, v2, 0
 ; GFX12-NEXT:    global_store_b64 v[0:1], v[2:3], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %val = load i32, ptr addrspace(1) %in, align 4
   %ext = sext i32 %val to i64
@@ -2804,8 +2792,6 @@ define amdgpu_kernel void @s_mul_u64_sext_with_sregs(ptr addrspace(1) %out, ptr
 ; GFX11-NEXT:    s_add_i32 s3, s4, s3
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: s_mul_u64_sext_with_sregs:
@@ -2820,8 +2806,6 @@ define amdgpu_kernel void @s_mul_u64_sext_with_sregs(ptr addrspace(1) %out, ptr
 ; GFX12-NEXT:    s_mul_u64 s[2:3], s[2:3], 0x50
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %val = load i32, ptr addrspace(1) %in, align 4
   %ext = sext i32 %val to i64
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
index 183f2edbf9035b..adbe92fdbc6252 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
@@ -305,8 +305,6 @@ define amdgpu_kernel void @mulu24_shl64(ptr addrspace(1) nocapture %arg) {
 ; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v4, v2
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo
 ; GFX11-NEXT:    global_store_b32 v[2:3], v1, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -404,8 +402,6 @@ define amdgpu_kernel void @muli24_shl64(ptr addrspace(1) nocapture %arg, ptr add
 ; GFX11-NEXT:    v_mul_i32_i24_e32 v0, -7, v0
 ; GFX11-NEXT:    v_lshlrev_b64 v[0:1], 3, v[0:1]
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll
index e500aae7e0f3c0..9cf9839e69d5d3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll
@@ -8,8 +8,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<8 x half> %A, <8 x half>
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[16:17], v[8:11], off
 ; GFX12-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.A = fneg <8 x half> %A
@@ -25,8 +23,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<8 x half> %A, <8 x half>
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[16:17], v[8:11], off
 ; GFX12-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.B = fneg <8 x half> %B
@@ -42,8 +38,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<8 x half> %A, <8 x half>
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[16:17], v[8:11], off
 ; GFX12-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.C = fneg <8 x float> %C
@@ -59,8 +53,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<8 x half> %A, <8 x half>
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[16:17], v[8:11], off
 ; GFX12-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
@@ -76,8 +68,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<8 x i16> %A, <8 x i16>
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[16:17], v[8:11], off
 ; GFX12-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.C = fneg <8 x float> %C
@@ -93,8 +83,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<8 x i16> %A, <8 x i16>
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[16:17], v[8:11], off
 ; GFX12-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
@@ -108,8 +96,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<8 x half> %A, <8 x half>
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.A = fneg <8 x half> %A
@@ -123,8 +109,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<8 x half> %A, <8 x half>
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] neg_hi:[0,1,0]
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.B = fneg <8 x half> %B
@@ -138,8 +122,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<8 x half> %A, <8 x half>
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1]
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.C = fneg <8 x half> %C
@@ -153,8 +135,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<8 x half> %A, <8 x half>
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1]
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fabs.C = call <8 x half> @llvm.fabs.v8f16(<8 x half> %C)
@@ -170,8 +150,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(<2 x i32> %A, <2 x i3
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.C = fneg <8 x float> %C
@@ -187,8 +165,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(<2 x i32> %A, <2 x i3
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
@@ -204,8 +180,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(<2 x i32> %A, <2 x i3
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.C = fneg <8 x float> %C
@@ -221,8 +195,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(<2 x i32> %A, <2 x i3
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
@@ -238,8 +210,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(<2 x i32> %A, <2 x i3
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.C = fneg <8 x float> %C
@@ -255,8 +225,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(<2 x i32> %A, <2 x i3
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
@@ -272,8 +240,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(<2 x i32> %A, <2 x i3
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.C = fneg <8 x float> %C
@@ -289,8 +255,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(<2 x i32> %A, <2 x i3
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
@@ -306,8 +270,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<8 x half> %A, <16 x ha
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[21:22], v[12:15], off
 ; GFX12-NEXT:    global_store_b128 v[21:22], v[16:19], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.A = fneg <8 x half> %A
@@ -323,8 +285,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<8 x half> %A, <16 x ha
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[21:22], v[12:15], off
 ; GFX12-NEXT:    global_store_b128 v[21:22], v[16:19], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.B = fneg <16 x half> %B
@@ -338,8 +298,6 @@ define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<8 x half> %A, <16 x ha
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GFX12-NEXT:    global_store_b128 v[17:18], v[12:15], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.A = fneg <8 x half> %A
@@ -353,8 +311,6 @@ define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<8 x half> %A, <16 x ha
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] neg_hi:[0,1,0]
 ; GFX12-NEXT:    global_store_b128 v[17:18], v[12:15], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.B = fneg <16 x half> %B
@@ -372,8 +328,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<8 x half> %A, <8 x ha
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[16:17], v[8:11], off
 ; GFX12-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
@@ -388,8 +342,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<8 x half> %A, <8 x ha
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fabs.C = call <8 x half> @llvm.fabs.v8f16(<8 x half> %C)
@@ -408,8 +360,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<8 x half> %
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[16:17], v[8:11], off
 ; GFX12-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %el3 = extractelement <8 x float> %C, i32 3
@@ -430,8 +380,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<8 x half> %A,
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[10:13], off
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[14:17], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.A = fneg <8 x half> %A
@@ -445,8 +393,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<8 x half> %A,
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0]
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[10:13], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.B = fneg <8 x half> %B
@@ -477,8 +423,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<8 x half> %A, <8 x
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[12:15], v[0:3], v[4:7], v[12:15] neg_lo:[0,0,1]
 ; GFX12-NEXT:    global_store_b128 v[10:11], v[12:15], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %C = load <16 x half>, ptr %Caddr
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-imm.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-imm.ll
index 3037c1ec2829e5..4959e10d2a18db 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-imm.ll
@@ -8,8 +8,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm(<8 x half> %A, <8 x half>
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[10:13], off
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[14:17], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
@@ -38,8 +36,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm_non_inlineable(<8 x half>
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[10:13], off
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[14:17], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
@@ -54,8 +50,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm(<8 x i16> %A, <8 x i16> %
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[10:13], off
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[14:17], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
@@ -84,8 +78,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm_non_inlineable(<8 x i16>
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[10:13], off
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[14:17], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
@@ -98,8 +90,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm(<8 x half> %A, <8 x half>
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[10:13], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> <half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
@@ -120,8 +110,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm_non_inlineable(<8 x half>
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], v[10:13]
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[10:13], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> <half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0>, i1 0)
@@ -142,8 +130,6 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm(<8 x i16> %A, <8 x i16>
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13]
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[10:13], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> <i16 16256, i16 16256, i16 16256, i16 16256, i16 16256, i16 16256, i16 16256, i16 16256>, i1 0)
@@ -164,8 +150,6 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm_non_inlineable(<8 x i16>
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13]
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[10:13], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> <i16 16320, i16 16320, i16 16320, i16 16320, i16 16320, i16 16320, i16 16320, i16 16320>, i1 0)
@@ -180,8 +164,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm(<2 x i32> %A, <2 x i32> %B
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i1 0)
@@ -210,8 +192,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm_non_inlineable(<2 x i32> %
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128>, i1 0)
@@ -226,8 +206,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm(i32 %A, i32 %B, ptr addrsp
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
 ; GFX12-NEXT:    global_store_b128 v[2:3], v[8:11], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i1 0)
@@ -256,8 +234,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm_non_inlineable(i32 %A, i32
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
 ; GFX12-NEXT:    global_store_b128 v[2:3], v[8:11], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> <i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128>, i1 0)
@@ -272,8 +248,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm(<2 x i32> %A, <2 x i32
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
@@ -302,8 +276,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable(<2 x i3
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
@@ -318,8 +290,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm(<2 x i32> %A, <2 x i32
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
@@ -348,8 +318,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable(<2 x i3
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
@@ -364,8 +332,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm(<2 x i32> %A, <2 x i32
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
@@ -394,8 +360,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable(<2 x i3
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
@@ -410,8 +374,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm(<2 x i32> %A, <2 x i32
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
@@ -440,8 +402,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable(<2 x i3
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
@@ -456,8 +416,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm(<2 x i32> %A, <2 x i32> %B
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i1 0)
@@ -486,8 +444,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm_non_inlineable(<2 x i32> %
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128>, i1 0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-iu-modifiers.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-iu-modifiers.ll
index 086144873a042f..22c61f99262221 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-iu-modifiers.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-iu-modifiers.ll
@@ -8,8 +8,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src0(<2 x i32> %A, <2 x i
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
@@ -24,8 +22,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src1(<2 x i32> %A, <2 x i
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
@@ -40,8 +36,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_clamp(<2 x i32> %A, <2 x i32>
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
@@ -58,8 +52,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src0(i32 %A, i32 %B, <8 x
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[10:11], v[2:5], off
 ; GFX12-NEXT:    global_store_b128 v[10:11], v[6:9], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 1, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 0)
@@ -74,8 +66,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src1(i32 %A, i32 %B, <8 x
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[10:11], v[2:5], off
 ; GFX12-NEXT:    global_store_b128 v[10:11], v[6:9], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 1, i32 %B, <8 x i32> %C, i1 0)
@@ -90,8 +80,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_clamp(i32 %A, i32 %B, <8 x i32
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[10:11], v[2:5], off
 ; GFX12-NEXT:    global_store_b128 v[10:11], v[6:9], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 1)
@@ -108,8 +96,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src0(<2 x i32> %A, <2 x i
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
@@ -124,8 +110,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src1(<2 x i32> %A, <2 x i
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
@@ -140,8 +124,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_clamp(<2 x i32> %A, <2 x i32>
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
@@ -160,8 +142,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src0(<2 x i32> %A, <4 x
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[6:9], off
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 1, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
@@ -176,8 +156,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src1(<2 x i32> %A, <4 x
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[6:9], off
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
@@ -192,8 +170,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_clamp(<2 x i32> %A, <4 x i32
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[6:9], off
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 1)
@@ -210,8 +186,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src0(i32 %A, <2 x i32>
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[3:6], off
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[7:10], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 1, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
@@ -226,8 +200,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src1(i32 %A, <2 x i32>
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[3:6], off
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[7:10], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 1, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
@@ -242,8 +214,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_clamp(i32 %A, <2 x i32> %B,
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[3:6], off
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[7:10], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 1)
@@ -260,8 +230,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src0(<2 x i32> %A, <4 x
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[6:9], off
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 1, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0)
@@ -276,8 +244,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src1(<2 x i32> %A, <4 x
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[6:9], off
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 0, <2 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0)
@@ -292,8 +258,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_clamp(<2 x i32> %A, <4 x i32
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[6:9], off
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 1)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-swmmac-index_key.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-swmmac-index_key.ll
index a6e1f5ef12b4bb..7eafe53ea84cf0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-swmmac-index_key.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-swmmac-index_key.ll
@@ -19,8 +19,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<8 x half> %A, <16
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[24:25], v[12:15], off
 ; GFX12-NEXT:    global_store_b128 v[24:25], v[16:19], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
@@ -51,8 +49,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16_index_key(<8 x i16> %A, <16
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[24:25], v[12:15], off
 ; GFX12-NEXT:    global_store_b128 v[24:25], v[16:19], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
@@ -77,8 +73,6 @@ define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_index_key(<8 x half> %A, <16
 ; GFX12-NEXT:    v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:1
 ; GFX12-NEXT:    global_store_b128 v[18:19], v[22:25], off
 ; GFX12-NEXT:    global_store_b128 v[20:21], v[12:15], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
@@ -103,8 +97,6 @@ define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16_index_key(<8 x i16> %A, <1
 ; GFX12-NEXT:    v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:1
 ; GFX12-NEXT:    global_store_b128 v[18:19], v[22:25], off
 ; GFX12-NEXT:    global_store_b128 v[20:21], v[12:15], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
@@ -135,8 +127,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_index_key(<2 x i32> %A, <4 x
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[18:19], v[6:9], off
 ; GFX12-NEXT:    global_store_b128 v[18:19], v[10:13], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
@@ -167,8 +157,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_index_key(i32 %A, <2 x i32>
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[3:6], off
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[7:10], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
@@ -199,8 +187,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8_index_key(<2 x i32> %A,
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[18:19], v[6:9], off
 ; GFX12-NEXT:    global_store_b128 v[18:19], v[10:13], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
@@ -231,8 +217,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8_index_key(<2 x i32> %A,
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[18:19], v[6:9], off
 ; GFX12-NEXT:    global_store_b128 v[18:19], v[10:13], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
@@ -263,8 +247,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8_index_key(<2 x i32> %A,
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[18:19], v[6:9], off
 ; GFX12-NEXT:    global_store_b128 v[18:19], v[10:13], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
@@ -295,8 +277,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8_index_key(<2 x i32> %A,
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[18:19], v[6:9], off
 ; GFX12-NEXT:    global_store_b128 v[18:19], v[10:13], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32.ll
index 3aa81da317d67d..80497115e41b24 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32.ll
@@ -8,8 +8,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x half> %A, <8 x half> %B,
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[16:17], v[8:11], off
 ; GFX12-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %C)
@@ -24,8 +22,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i16> %A, <8 x i16> %B, <
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[16:17], v[8:11], off
 ; GFX12-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> %C)
@@ -38,8 +34,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16(<8 x half> %A, <8 x half> %B,
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11]
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %C, i1 0)
@@ -52,8 +46,6 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16(<8 x i16> %A, <8 x i16> %B,
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11]
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i1 0)
@@ -68,8 +60,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu8(<2 x i32> %A, <2 x i32> %B, <8
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
@@ -84,8 +74,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu4(i32 %A, i32 %B, <8 x i32> %C,
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[10:11], v[2:5], off
 ; GFX12-NEXT:    global_store_b128 v[10:11], v[6:9], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 0)
@@ -100,8 +88,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8(<2 x i32> %A, <2 x i32> %B
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C)
@@ -116,8 +102,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8(<2 x i32> %A, <2 x i32> %B
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C)
@@ -132,8 +116,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8(<2 x i32> %A, <2 x i32> %B
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C)
@@ -148,8 +130,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8(<2 x i32> %A, <2 x i32> %B
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C)
@@ -164,8 +144,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x32_iu4(<2 x i32> %A, <2 x i32> %B, <8
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
@@ -181,8 +159,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_f16(<8 x half> %A, <16 x half> %
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[21:22], v[12:15], off
 ; GFX12-NEXT:    global_store_b128 v[21:22], v[16:19], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index)
@@ -197,8 +173,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16(<8 x i16> %A, <16 x i16> %B
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[21:22], v[12:15], off
 ; GFX12-NEXT:    global_store_b128 v[21:22], v[16:19], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index)
@@ -211,8 +185,6 @@ define amdgpu_ps void @test_swmmac_f16_16x16x32_f16(<8 x half> %A, <16 x half> %
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16
 ; GFX12-NEXT:    global_store_b128 v[17:18], v[12:15], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index)
@@ -225,8 +197,6 @@ define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16(<8 x i16> %A, <16 x i16> %
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16
 ; GFX12-NEXT:    global_store_b128 v[17:18], v[12:15], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index)
@@ -241,8 +211,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8(<2 x i32> %A, <4 x i32> %B,
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[6:9], off
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
@@ -257,8 +225,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4(i32 %A, <2 x i32> %B, <8 x i
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[3:6], off
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[7:10], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
@@ -273,8 +239,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4(<2 x i32> %A, <4 x i32> %B,
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[6:9], off
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0)
@@ -289,8 +253,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8(<2 x i32> %A, <4 x i32>
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[6:9], off
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
@@ -305,8 +267,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8(<2 x i32> %A, <4 x i32>
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[6:9], off
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
@@ -321,8 +281,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8(<2 x i32> %A, <4 x i32>
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[6:9], off
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
@@ -337,8 +295,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8(<2 x i32> %A, <4 x i32>
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[6:9], off
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll
index 08a5d6660bab2c..be1761227f8022 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll
@@ -6,8 +6,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<4 x half> %A, <4 x half>
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.A = fneg <4 x half> %A
@@ -21,8 +19,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<4 x half> %A, <4 x half>
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] neg_hi:[0,1,0]
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.B = fneg <4 x half> %B
@@ -36,8 +32,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<4 x half> %A, <4 x half>
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.C = fneg <4 x float> %C
@@ -51,8 +45,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<4 x half> %A, <4 x half>
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1]
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
@@ -66,8 +58,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<4 x i16> %A, <4 x i16>
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.C = fneg <4 x float> %C
@@ -81,8 +71,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<4 x i16> %A, <4 x i16>
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1]
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
@@ -96,8 +84,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<4 x half> %A, <4 x half>
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GFX12-NEXT:    global_store_b64 v[6:7], v[4:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.A = fneg <4 x half> %A
@@ -111,8 +97,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<4 x half> %A, <4 x half>
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] neg_hi:[0,1,0]
 ; GFX12-NEXT:    global_store_b64 v[6:7], v[4:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.B = fneg <4 x half> %B
@@ -126,8 +110,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<4 x half> %A, <4 x half>
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1]
 ; GFX12-NEXT:    global_store_b64 v[6:7], v[4:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.C = fneg <4 x half> %C
@@ -141,8 +123,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<4 x half> %A, <4 x half>
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1]
 ; GFX12-NEXT:    global_store_b64 v[6:7], v[4:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fabs.C = call <4 x half> @llvm.fabs.v4f16(<4 x half> %C)
@@ -156,8 +136,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(i32 %A, i32 %B, <4 x
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.C = fneg <4 x float> %C
@@ -171,8 +149,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(i32 %A, i32 %B, <4 x
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
@@ -186,8 +162,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(i32 %A, i32 %B, <4 x
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.C = fneg <4 x float> %C
@@ -201,8 +175,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(i32 %A, i32 %B, <4 x
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
@@ -216,8 +188,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(i32 %A, i32 %B, <4 x
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.C = fneg <4 x float> %C
@@ -231,8 +201,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(i32 %A, i32 %B, <4 x
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
@@ -246,8 +214,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(i32 %A, i32 %B, <4 x
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.C = fneg <4 x float> %C
@@ -261,8 +227,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(i32 %A, i32 %B, <4 x
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
@@ -276,8 +240,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<4 x half> %A, <8 x hal
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GFX12-NEXT:    global_store_b128 v[11:12], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.A = fneg <4 x half> %A
@@ -291,8 +253,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<4 x half> %A, <8 x hal
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] neg_hi:[0,1,0]
 ; GFX12-NEXT:    global_store_b128 v[11:12], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.B = fneg <8 x half> %B
@@ -306,8 +266,6 @@ define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<4 x half> %A, <8 x hal
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GFX12-NEXT:    global_store_b64 v[9:10], v[6:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.A = fneg <4 x half> %A
@@ -321,8 +279,6 @@ define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<4 x half> %A, <8 x hal
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] neg_hi:[0,1,0]
 ; GFX12-NEXT:    global_store_b64 v[9:10], v[6:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.B = fneg <8 x half> %B
@@ -338,8 +294,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<4 x half> %A, <4 x ha
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
@@ -354,8 +308,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<4 x half> %A, <4 x ha
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GFX12-NEXT:    global_store_b64 v[6:7], v[4:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fabs.C = call <4 x half> @llvm.fabs.v4f16(<4 x half> %C)
@@ -372,8 +324,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<4 x half> %
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %el3 = extractelement <4 x float> %C, i32 3
@@ -392,8 +342,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<4 x half> %A,
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.A = fneg <4 x half> %A
@@ -407,8 +355,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<4 x half> %A,
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0]
 ; GFX12-NEXT:    global_store_b64 v[4:5], v[6:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.B = fneg <4 x half> %B
@@ -432,8 +378,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<4 x half> %A, <4 x
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1]
 ; GFX12-NEXT:    global_store_b64 v[6:7], v[4:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %C = load <8 x half>, ptr %Caddr
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll
index 738671cf7c6482..173dd011f4d67f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll
@@ -6,8 +6,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm(<4 x half> %A, <4 x half>
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float>@llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
@@ -30,8 +28,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm_non_inlineable(<4 x half>
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], v[6:9]
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float>@llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
@@ -44,8 +40,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm(<4 x i16> %A, <4 x i16> %
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf16 v[6:9], v[0:1], v[2:3], 1.0
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
@@ -68,8 +62,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm_non_inlineable(<4 x i16>
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf16 v[6:9], v[0:1], v[2:3], v[6:9]
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
@@ -82,8 +74,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm(<4 x half> %A, <4 x half>
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0
 ; GFX12-NEXT:    global_store_b64 v[4:5], v[6:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
@@ -102,8 +92,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm_non_inlineable(<4 x half>
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], v[6:7]
 ; GFX12-NEXT:    global_store_b64 v[4:5], v[6:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> <half 3.0, half 3.0, half 3.0, half 3.0>, i1 0)
@@ -122,8 +110,6 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm(<4 x i16> %A, <4 x i16>
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], v[6:7]
 ; GFX12-NEXT:    global_store_b64 v[4:5], v[6:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> <i16 16256, i16 16256, i16 16256, i16 16256>, i1 0)
@@ -142,8 +128,6 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm_non_inlineable(<4 x i16>
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], v[6:7]
 ; GFX12-NEXT:    global_store_b64 v[4:5], v[6:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> <i16 16320, i16 16320, i16 16320, i16 16320>, i1 0)
@@ -156,8 +140,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm(i32 %A, i32 %B, ptr addrsp
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_i32_16x16x16_iu8 v[4:7], v0, v1, 1
 ; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, i1 0)
@@ -180,8 +162,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm_non_inlineable(i32 %A, i32
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_wmma_i32_16x16x16_iu8 v[4:7], v0, v1, v[4:7]
 ; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 128, i32 128, i32 128, i32 128>, i1 0)
@@ -194,8 +174,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm(i32 %A, i32 %B, ptr addrsp
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:7], v0, v1, 1
 ; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, i1 0)
@@ -218,8 +196,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm_non_inlineable(i32 %A, i32
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:7], v0, v1, v[4:7]
 ; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 128, i32 128, i32 128, i32 128>, i1 0)
@@ -232,8 +208,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm(i32 %A, i32 %B, <4 x f
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[4:7], v0, v1, 1.0
 ; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
@@ -256,8 +230,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable(i32 %A,
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[4:7], v0, v1, v[4:7]
 ; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
@@ -270,8 +242,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm(i32 %A, i32 %B, <4 x f
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[4:7], v0, v1, 1.0
 ; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
@@ -294,8 +264,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable(i32 %A,
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[4:7], v0, v1, v[4:7]
 ; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
@@ -308,8 +276,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm(i32 %A, i32 %B, <4 x f
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[4:7], v0, v1, 1.0
 ; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
@@ -332,8 +298,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable(i32 %A,
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[4:7], v0, v1, v[4:7]
 ; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
@@ -346,8 +310,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm(i32 %A, i32 %B, <4 x f
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[4:7], v0, v1, 1.0
 ; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
@@ -370,8 +332,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable(i32 %A,
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[4:7], v0, v1, v[4:7]
 ; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
@@ -384,8 +344,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm(i32 %A, i32 %B, <4 x i32>
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_i32_16x16x32_iu4 v[4:7], v0, v1, 1
 ; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, i1 0)
@@ -408,8 +366,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm_non_inlineable(i32 %A, i32
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_wmma_i32_16x16x32_iu4 v[4:7], v0, v1, v[4:7]
 ; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 128, i32 128, i32 128, i32 128>, i1 0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-iu-modifiers.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-iu-modifiers.ll
index 25f4145f9724c4..83bbf56daec20e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-iu-modifiers.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-iu-modifiers.ll
@@ -6,8 +6,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src0(i32 %A, i32 %B, <4 x
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
@@ -20,8 +18,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src1(i32 %A, i32 %B, <4 x
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0)
@@ -34,8 +30,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_clamp(i32 %A, i32 %B, <4 x i32
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] clamp
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1)
@@ -50,8 +44,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src0(i32 %A, i32 %B, <4 x
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
@@ -64,8 +56,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src1(i32 %A, i32 %B, <4 x
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0)
@@ -78,8 +68,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_clamp(i32 %A, i32 %B, <4 x i32
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] clamp
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1)
@@ -94,8 +82,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src0(i32 %A, i32 %B, <4 x
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
@@ -108,8 +94,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src1(i32 %A, i32 %B, <4 x
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0)
@@ -122,8 +106,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_clamp(i32 %A, i32 %B, <4 x i32
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] clamp
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1)
@@ -141,8 +123,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src0(i32 %A, <2 x i32>
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0]
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 1, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0)
@@ -155,8 +135,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src1(i32 %A, <2 x i32>
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0]
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 1, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0)
@@ -169,8 +147,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_clamp(i32 %A, <2 x i32> %B,
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 clamp
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 1)
@@ -185,8 +161,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src0(i32 %A, i32 %B, <4
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[1,0,0]
 ; GFX12-NEXT:    global_store_b128 v[7:8], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 0)
@@ -199,8 +173,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src1(i32 %A, i32 %B, <4
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[0,1,0]
 ; GFX12-NEXT:    global_store_b128 v[7:8], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i16 %Index, i1 0)
@@ -213,8 +185,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_clamp(i32 %A, i32 %B, <4 x i
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 clamp
 ; GFX12-NEXT:    global_store_b128 v[7:8], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 1)
@@ -229,8 +199,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src0(i32 %A, <2 x i32>
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0]
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 1, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0)
@@ -243,8 +211,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src1(i32 %A, <2 x i32>
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0]
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 1, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0)
@@ -257,8 +223,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_clamp(i32 %A, <2 x i32> %B,
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 clamp
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 1)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-swmmac-index_key.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-swmmac-index_key.ll
index 87fe8334d059f2..1e9ef07ba75423 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-swmmac-index_key.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-swmmac-index_key.ll
@@ -27,8 +27,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<4 x half> %A, <8
 ; GFX12-NEXT:    global_store_b128 v[14:15], v[24:27], off
 ; GFX12-NEXT:    global_store_b128 v[16:17], v[28:31], off
 ; GFX12-NEXT:    global_store_b128 v[18:19], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
@@ -73,8 +71,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16_index_key(<4 x i16> %A, <8
 ; GFX12-NEXT:    global_store_b128 v[14:15], v[24:27], off
 ; GFX12-NEXT:    global_store_b128 v[16:17], v[28:31], off
 ; GFX12-NEXT:    global_store_b128 v[18:19], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
@@ -113,8 +109,6 @@ define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_index_key(<4 x half> %A, <8
 ; GFX12-NEXT:    global_store_b64 v[12:13], v[18:19], off
 ; GFX12-NEXT:    global_store_b64 v[14:15], v[20:21], off
 ; GFX12-NEXT:    global_store_b64 v[16:17], v[6:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
@@ -153,8 +147,6 @@ define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16_index_key(<4 x i16> %A, <8
 ; GFX12-NEXT:    global_store_b64 v[12:13], v[18:19], off
 ; GFX12-NEXT:    global_store_b64 v[14:15], v[20:21], off
 ; GFX12-NEXT:    global_store_b64 v[16:17], v[6:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
@@ -199,8 +191,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_index_key(i32 %A, <2 x i32>
 ; GFX12-NEXT:    global_store_b128 v[11:12], v[21:24], off
 ; GFX12-NEXT:    global_store_b128 v[13:14], v[25:28], off
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[3:6], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
@@ -233,8 +223,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_index_key(i32 %A, i32 %B, <4
 ; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:1
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[12:15], off
 ; GFX12-NEXT:    global_store_b128 v[10:11], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
@@ -261,8 +249,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_index_key(i32 %A, <2 x i32>
 ; GFX12-NEXT:    v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:1
 ; GFX12-NEXT:    global_store_b128 v[9:10], v[13:16], off
 ; GFX12-NEXT:    global_store_b128 v[11:12], v[3:6], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
@@ -301,8 +287,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8_index_key(i32 %A, <2 x i
 ; GFX12-NEXT:    global_store_b128 v[11:12], v[21:24], off
 ; GFX12-NEXT:    global_store_b128 v[13:14], v[25:28], off
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[3:6], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
@@ -347,8 +331,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8_index_key(i32 %A, <2 x i
 ; GFX12-NEXT:    global_store_b128 v[11:12], v[21:24], off
 ; GFX12-NEXT:    global_store_b128 v[13:14], v[25:28], off
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[3:6], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
@@ -393,8 +375,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8_index_key(i32 %A, <2 x i
 ; GFX12-NEXT:    global_store_b128 v[11:12], v[21:24], off
 ; GFX12-NEXT:    global_store_b128 v[13:14], v[25:28], off
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[3:6], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
@@ -439,8 +419,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8_index_key(i32 %A, <2 x i
 ; GFX12-NEXT:    global_store_b128 v[11:12], v[21:24], off
 ; GFX12-NEXT:    global_store_b128 v[13:14], v[25:28], off
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[3:6], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64.ll
index a03180f0395264..f01679fb45d90d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64.ll
@@ -6,8 +6,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<4 x half> %A, <4 x half> %B,
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7]
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float>@llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %C)
@@ -20,8 +18,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<4 x i16> %A, <4 x i16> %B, <
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7]
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> %C)
@@ -34,8 +30,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16(<4 x half> %A, <4 x half> %B,
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5]
 ; GFX12-NEXT:    global_store_b64 v[6:7], v[4:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> %C, i1 0)
@@ -48,8 +42,6 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16(<4 x i16> %A, <4 x i16> %B,
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5]
 ; GFX12-NEXT:    global_store_b64 v[6:7], v[4:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, i1 0)
@@ -62,8 +54,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu8(i32 %A, i32 %B, <4 x i32> %C,
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5]
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
@@ -76,8 +66,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu4(i32 %A, i32 %B, <4 x i32> %C,
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5]
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
@@ -90,8 +78,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8(i32 %A, i32 %B, <4 x float
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5]
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C)
@@ -104,8 +90,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8(i32 %A, i32 %B, <4 x float
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5]
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C)
@@ -118,8 +102,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8(i32 %A, i32 %B, <4 x float
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5]
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C)
@@ -132,8 +114,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8(i32 %A, i32 %B, <4 x float
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5]
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C)
@@ -146,8 +126,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x32_iu4(i32 %A, i32 %B, <4 x i32> %C,
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5]
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
@@ -160,8 +138,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_f16(<4 x half> %A, <8 x half> %B
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10
 ; GFX12-NEXT:    global_store_b128 v[11:12], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index)
@@ -174,8 +150,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16(<4 x i16> %A, <8 x i16> %B,
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10
 ; GFX12-NEXT:    global_store_b128 v[11:12], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index)
@@ -188,8 +162,6 @@ define amdgpu_ps void @test_swmmac_f16_16x16x32_f16(<4 x half> %A, <8 x half> %B
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8
 ; GFX12-NEXT:    global_store_b64 v[9:10], v[6:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index)
@@ -202,8 +174,6 @@ define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16(<4 x i16> %A, <8 x i16> %B
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8
 ; GFX12-NEXT:    global_store_b64 v[9:10], v[6:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index)
@@ -216,8 +186,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8(i32 %A, <2 x i32> %B, <4 x i
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0)
@@ -230,8 +198,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4(i32 %A, i32 %B, <4 x i32> %C
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6
 ; GFX12-NEXT:    global_store_b128 v[7:8], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 0)
@@ -244,8 +210,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4(i32 %A, <2 x i32> %B, <4 x i
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0)
@@ -258,8 +222,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8(i32 %A, <2 x i32> %B, <4
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index)
@@ -272,8 +234,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8(i32 %A, <2 x i32> %B, <4
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index)
@@ -286,8 +246,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8(i32 %A, <2 x i32> %B, <4
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index)
@@ -300,8 +258,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8(i32 %A, <2 x i32> %B, <4
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index)
diff --git a/llvm/test/CodeGen/AMDGPU/add.ll b/llvm/test/CodeGen/AMDGPU/add.ll
index 94d704fa3f92d6..3c9d43a88a0fda 100644
--- a/llvm/test/CodeGen/AMDGPU/add.ll
+++ b/llvm/test/CodeGen/AMDGPU/add.ll
@@ -67,8 +67,6 @@ define amdgpu_kernel void @s_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: s_add_i32:
@@ -81,8 +79,6 @@ define amdgpu_kernel void @s_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %b_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
   %a = load i32, ptr addrspace(1) %in
@@ -163,8 +159,6 @@ define amdgpu_kernel void @s_add_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: s_add_v2i32:
@@ -179,8 +173,6 @@ define amdgpu_kernel void @s_add_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %b_ptr = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 1
   %a = load <2 x i32>, ptr addrspace(1) %in
@@ -280,8 +272,6 @@ define amdgpu_kernel void @s_add_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
 ; GFX11-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[8:9]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: s_add_v4i32:
@@ -299,8 +289,6 @@ define amdgpu_kernel void @s_add_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX12-NEXT:    global_store_b128 v4, v[0:3], s[8:9]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %b_ptr = getelementptr <4 x i32>, ptr addrspace(1) %in, i32 1
   %a = load <4 x i32>, ptr addrspace(1) %in
@@ -446,8 +434,6 @@ define amdgpu_kernel void @s_add_v8i32(ptr addrspace(1) %out, <8 x i32> %a, <8 x
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b128 v8, v[0:3], s[0:1] offset:16
 ; GFX11-NEXT:    global_store_b128 v8, v[4:7], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: s_add_v8i32:
@@ -472,8 +458,6 @@ define amdgpu_kernel void @s_add_v8i32(ptr addrspace(1) %out, <8 x i32> %a, <8 x
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v8, v[0:3], s[0:1] offset:16
 ; GFX12-NEXT:    global_store_b128 v8, v[4:7], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %0 = add <8 x i32> %a, %b
@@ -719,8 +703,6 @@ define amdgpu_kernel void @s_add_v16i32(ptr addrspace(1) %out, <16 x i32> %a, <1
 ; GFX11-NEXT:    global_store_b128 v16, v[4:7], s[0:1] offset:32
 ; GFX11-NEXT:    global_store_b128 v16, v[8:11], s[0:1] offset:16
 ; GFX11-NEXT:    global_store_b128 v16, v[12:15], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: s_add_v16i32:
@@ -760,8 +742,6 @@ define amdgpu_kernel void @s_add_v16i32(ptr addrspace(1) %out, <16 x i32> %a, <1
 ; GFX12-NEXT:    global_store_b128 v16, v[4:7], s[0:1] offset:32
 ; GFX12-NEXT:    global_store_b128 v16, v[8:11], s[0:1] offset:16
 ; GFX12-NEXT:    global_store_b128 v16, v[12:15], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %0 = add <16 x i32> %a, %b
@@ -853,8 +833,6 @@ define amdgpu_kernel void @v_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_add_nc_u32_e32 v0, v1, v0
 ; GFX11-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_add_i32:
@@ -871,8 +849,6 @@ define amdgpu_kernel void @v_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_add_nc_u32_e32 v0, v1, v0
 ; GFX12-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %tid
@@ -955,8 +931,6 @@ define amdgpu_kernel void @v_add_imm_i32(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_add_nc_u32_e32 v0, 0x7b, v0
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_add_imm_i32:
@@ -970,8 +944,6 @@ define amdgpu_kernel void @v_add_imm_i32(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_add_nc_u32_e32 v0, 0x7b, v0
 ; GFX12-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %tid
@@ -1051,8 +1023,6 @@ define amdgpu_kernel void @add64(ptr addrspace(1) %out, i64 %a, i64 %b) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: add64:
@@ -1065,8 +1035,6 @@ define amdgpu_kernel void @add64(ptr addrspace(1) %out, i64 %a, i64 %b) {
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %add = add i64 %a, %b
@@ -1157,8 +1125,6 @@ define amdgpu_kernel void @add64_sgpr_vgpr(ptr addrspace(1) %out, i64 %a, ptr ad
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: add64_sgpr_vgpr:
@@ -1173,8 +1139,6 @@ define amdgpu_kernel void @add64_sgpr_vgpr(ptr addrspace(1) %out, i64 %a, ptr ad
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %0 = load i64, ptr addrspace(1) %in
@@ -1303,8 +1267,6 @@ define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace(
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ; GFX11-NEXT:  .LBB9_4:
 ; GFX11-NEXT:    ; implicit-def: $sgpr4_sgpr5
@@ -1326,8 +1288,6 @@ define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace(
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 ; GFX12-NEXT:  .LBB9_4:
 ; GFX12-NEXT:    ; implicit-def: $sgpr4_sgpr5
diff --git a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
index 4cc384e9d27188..b413e779dbaf19 100644
--- a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
@@ -78,8 +78,6 @@ define amdgpu_kernel void @v_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_add_u16 v0, v1, v0
 ; GFX11-NEXT:    global_store_b32 v2, v0, s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
@@ -154,8 +152,6 @@ define amdgpu_kernel void @s_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_pk_add_u16 v1, s2, s0
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %a = load <2 x i16>, ptr addrspace(4) %in0
   %b = load <2 x i16>, ptr addrspace(4) %in1
@@ -215,8 +211,6 @@ define amdgpu_kernel void @s_test_add_self_v2i16(ptr addrspace(1) %out, ptr addr
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_pk_add_u16 v1, s2, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %a = load <2 x i16>, ptr addrspace(4) %in0
   %add = add <2 x i16> %a, %a
@@ -269,8 +263,6 @@ define amdgpu_kernel void @s_test_add_v2i16_kernarg(ptr addrspace(1) %out, <2 x
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_pk_add_u16 v1, s2, s3
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %add = add <2 x i16> %a, %b
   store <2 x i16> %add, ptr addrspace(1) %out
@@ -334,8 +326,6 @@ define amdgpu_kernel void @v_test_add_v2i16_constant(ptr addrspace(1) %out, ptr
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_add_u16 v0, 0x1c8007b, v0
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
@@ -403,8 +393,6 @@ define amdgpu_kernel void @v_test_add_v2i16_neg_constant(ptr addrspace(1) %out,
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_add_u16 v0, 0xfc21fcb3, v0
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
@@ -470,8 +458,6 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(ptr addrspace(1) %out, p
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_add_u16 v0, v0, -1
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
@@ -536,8 +522,6 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(ptr addrspace(1) %
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_add_u16 v0, v0, 32
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
@@ -603,8 +587,6 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(ptr addrspace(1) %ou
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_add_u16 v0, v0, 1.0
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
@@ -694,8 +676,6 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i32(ptr addrspace(1) %out,
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i32>, ptr addrspace(1) %out, i32 %tid
@@ -792,8 +772,6 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(ptr addrspace(1) %out,
 ; GFX11-NEXT:    v_alignbit_b32 v2, 0, v0, 16
 ; GFX11-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v0, 0xffff, v0
 ; GFX11-NEXT:    global_store_b128 v1, v[0:3], s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i64>, ptr addrspace(1) %out, i32 %tid
@@ -888,8 +866,6 @@ define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i32(ptr addrspace(1) %out,
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v1, 16, v0
 ; GFX11-NEXT:    v_bfe_i32 v0, v0, 0, 16
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i32>, ptr addrspace(1) %out, i32 %tid
@@ -994,8 +970,6 @@ define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i64(ptr addrspace(1) %out,
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
 ; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i64>, ptr addrspace(1) %out, i32 %tid
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
index 5cf9c9faa693ed..ad0babd74f9c5e 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
@@ -185,8 +185,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX11W64-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
 ; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11W64-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11W64-NEXT:    s_nop 0
-; GFX11W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W64-NEXT:    s_endpgm
 ;
 ; GFX11W32-LABEL: add_i32_constant:
@@ -216,8 +214,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX11W32-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
 ; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11W32-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11W32-NEXT:    s_nop 0
-; GFX11W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W32-NEXT:    s_endpgm
 ;
 ; GFX12W64-LABEL: add_i32_constant:
@@ -250,8 +246,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX12W64-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
 ; GFX12W64-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W64-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12W64-NEXT:    s_nop 0
-; GFX12W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12W64-NEXT:    s_endpgm
 ;
 ; GFX12W32-LABEL: add_i32_constant:
@@ -284,8 +278,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX12W32-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
 ; GFX12W32-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W32-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12W32-NEXT:    s_nop 0
-; GFX12W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12W32-NEXT:    s_endpgm
 entry:
   %old = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32 5, ptr addrspace(8) %inout, i32 0, i32 0, i32 0)
@@ -471,8 +463,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX11W64-NEXT:    v_mad_u64_u32 v[1:2], null, s6, v0, s[2:3]
 ; GFX11W64-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11W64-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11W64-NEXT:    s_nop 0
-; GFX11W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W64-NEXT:    s_endpgm
 ;
 ; GFX11W32-LABEL: add_i32_uniform:
@@ -503,8 +493,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX11W32-NEXT:    v_mad_u64_u32 v[1:2], null, s0, v0, s[4:5]
 ; GFX11W32-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11W32-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11W32-NEXT:    s_nop 0
-; GFX11W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W32-NEXT:    s_endpgm
 ;
 ; GFX12W64-LABEL: add_i32_uniform:
@@ -538,8 +526,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX12W64-NEXT:    v_mad_co_u64_u32 v[0:1], null, s6, v0, s[2:3]
 ; GFX12W64-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12W64-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12W64-NEXT:    s_nop 0
-; GFX12W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12W64-NEXT:    s_endpgm
 ;
 ; GFX12W32-LABEL: add_i32_uniform:
@@ -573,8 +559,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX12W32-NEXT:    v_mad_co_u64_u32 v[0:1], null, s0, v0, s[4:5]
 ; GFX12W32-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12W32-NEXT:    global_store_b32 v1, v0, s[2:3]
-; GFX12W32-NEXT:    s_nop 0
-; GFX12W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12W32-NEXT:    s_endpgm
 entry:
   %old = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32 %additive, ptr addrspace(8) %inout, i32 0, i32 0, i32 0)
@@ -831,8 +815,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX11W64-NEXT:    v_add_nc_u32_e32 v0, s2, v0
 ; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11W64-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11W64-NEXT:    s_nop 0
-; GFX11W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W64-NEXT:    s_endpgm
 ;
 ; GFX11W32-LABEL: add_i32_varying_vdata:
@@ -875,8 +857,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX11W32-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0
 ; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11W32-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11W32-NEXT:    s_nop 0
-; GFX11W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W32-NEXT:    s_endpgm
 ;
 ; GFX12W64-LABEL: add_i32_varying_vdata:
@@ -923,8 +903,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX12W64-NEXT:    v_add_nc_u32_e32 v0, s2, v0
 ; GFX12W64-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W64-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12W64-NEXT:    s_nop 0
-; GFX12W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12W64-NEXT:    s_endpgm
 ;
 ; GFX12W32-LABEL: add_i32_varying_vdata:
@@ -971,8 +949,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX12W32-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0
 ; GFX12W32-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W32-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12W32-NEXT:    s_nop 0
-; GFX12W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12W32-NEXT:    s_endpgm
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1245,8 +1221,6 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
 ; GFX11W64-NEXT:    v_add_nc_u32_e32 v0, s2, v0
 ; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11W64-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11W64-NEXT:    s_nop 0
-; GFX11W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W64-NEXT:    s_endpgm
 ;
 ; GFX11W32-LABEL: struct_add_i32_varying_vdata:
@@ -1291,8 +1265,6 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
 ; GFX11W32-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0
 ; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11W32-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11W32-NEXT:    s_nop 0
-; GFX11W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W32-NEXT:    s_endpgm
 ;
 ; GFX12W64-LABEL: struct_add_i32_varying_vdata:
@@ -1342,8 +1314,6 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
 ; GFX12W64-NEXT:    v_add_nc_u32_e32 v0, s2, v0
 ; GFX12W64-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W64-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12W64-NEXT:    s_nop 0
-; GFX12W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12W64-NEXT:    s_endpgm
 ;
 ; GFX12W32-LABEL: struct_add_i32_varying_vdata:
@@ -1392,8 +1362,6 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
 ; GFX12W32-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0
 ; GFX12W32-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W32-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12W32-NEXT:    s_nop 0
-; GFX12W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12W32-NEXT:    s_endpgm
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1466,8 +1434,6 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add
 ; GFX11W64-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11W64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11W64-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11W64-NEXT:    s_nop 0
-; GFX11W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W64-NEXT:    s_endpgm
 ;
 ; GFX11W32-LABEL: add_i32_varying_offset:
@@ -1481,8 +1447,6 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add
 ; GFX11W32-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11W32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11W32-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11W32-NEXT:    s_nop 0
-; GFX11W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W32-NEXT:    s_endpgm
 ;
 ; GFX12W64-LABEL: add_i32_varying_offset:
@@ -1497,8 +1461,6 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add
 ; GFX12W64-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12W64-NEXT:    s_wait_loadcnt 0x0
 ; GFX12W64-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12W64-NEXT:    s_nop 0
-; GFX12W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12W64-NEXT:    s_endpgm
 ;
 ; GFX12W32-LABEL: add_i32_varying_offset:
@@ -1512,8 +1474,6 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add
 ; GFX12W32-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12W32-NEXT:    s_wait_loadcnt 0x0
 ; GFX12W32-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12W32-NEXT:    s_nop 0
-; GFX12W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12W32-NEXT:    s_endpgm
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1697,8 +1657,6 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX11W64-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
 ; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11W64-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11W64-NEXT:    s_nop 0
-; GFX11W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W64-NEXT:    s_endpgm
 ;
 ; GFX11W32-LABEL: sub_i32_constant:
@@ -1729,8 +1687,6 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX11W32-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
 ; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11W32-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11W32-NEXT:    s_nop 0
-; GFX11W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W32-NEXT:    s_endpgm
 ;
 ; GFX12W64-LABEL: sub_i32_constant:
@@ -1764,8 +1720,6 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX12W64-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
 ; GFX12W64-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W64-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12W64-NEXT:    s_nop 0
-; GFX12W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12W64-NEXT:    s_endpgm
 ;
 ; GFX12W32-LABEL: sub_i32_constant:
@@ -1799,8 +1753,6 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX12W32-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
 ; GFX12W32-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W32-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12W32-NEXT:    s_nop 0
-; GFX12W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12W32-NEXT:    s_endpgm
 entry:
   %old = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.sub(i32 5, ptr addrspace(8) %inout, i32 0, i32 0, i32 0)
@@ -1987,8 +1939,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11W64-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
 ; GFX11W64-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11W64-NEXT:    s_nop 0
-; GFX11W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W64-NEXT:    s_endpgm
 ;
 ; GFX11W32-LABEL: sub_i32_uniform:
@@ -2020,8 +1970,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11W32-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
 ; GFX11W32-NEXT:    global_store_b32 v1, v0, s[2:3]
-; GFX11W32-NEXT:    s_nop 0
-; GFX11W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W32-NEXT:    s_endpgm
 ;
 ; GFX12W64-LABEL: sub_i32_uniform:
@@ -2056,8 +2004,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX12W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12W64-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
 ; GFX12W64-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12W64-NEXT:    s_nop 0
-; GFX12W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12W64-NEXT:    s_endpgm
 ;
 ; GFX12W32-LABEL: sub_i32_uniform:
@@ -2091,8 +2037,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX12W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12W32-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
 ; GFX12W32-NEXT:    global_store_b32 v1, v0, s[2:3]
-; GFX12W32-NEXT:    s_nop 0
-; GFX12W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12W32-NEXT:    s_endpgm
 entry:
   %old = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.sub(i32 %subitive, ptr addrspace(8) %inout, i32 0, i32 0, i32 0)
@@ -2349,8 +2293,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX11W64-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
 ; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11W64-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11W64-NEXT:    s_nop 0
-; GFX11W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W64-NEXT:    s_endpgm
 ;
 ; GFX11W32-LABEL: sub_i32_varying_vdata:
@@ -2394,8 +2336,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX11W32-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
 ; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11W32-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11W32-NEXT:    s_nop 0
-; GFX11W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W32-NEXT:    s_endpgm
 ;
 ; GFX12W64-LABEL: sub_i32_varying_vdata:
@@ -2442,8 +2382,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX12W64-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
 ; GFX12W64-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W64-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12W64-NEXT:    s_nop 0
-; GFX12W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12W64-NEXT:    s_endpgm
 ;
 ; GFX12W32-LABEL: sub_i32_varying_vdata:
@@ -2491,8 +2429,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX12W32-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
 ; GFX12W32-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W32-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12W32-NEXT:    s_nop 0
-; GFX12W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12W32-NEXT:    s_endpgm
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
@@ -2565,8 +2501,6 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add
 ; GFX11W64-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11W64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11W64-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11W64-NEXT:    s_nop 0
-; GFX11W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W64-NEXT:    s_endpgm
 ;
 ; GFX11W32-LABEL: sub_i32_varying_offset:
@@ -2580,8 +2514,6 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add
 ; GFX11W32-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11W32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11W32-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11W32-NEXT:    s_nop 0
-; GFX11W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W32-NEXT:    s_endpgm
 ;
 ; GFX12W64-LABEL: sub_i32_varying_offset:
@@ -2596,8 +2528,6 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add
 ; GFX12W64-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12W64-NEXT:    s_wait_loadcnt 0x0
 ; GFX12W64-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12W64-NEXT:    s_nop 0
-; GFX12W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12W64-NEXT:    s_endpgm
 ;
 ; GFX12W32-LABEL: sub_i32_varying_offset:
@@ -2611,8 +2541,6 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add
 ; GFX12W32-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12W32-NEXT:    s_wait_loadcnt 0x0
 ; GFX12W32-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12W32-NEXT:    s_nop 0
-; GFX12W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12W32-NEXT:    s_endpgm
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index d24eed841a9af1..495bfec5454ee8 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -219,8 +219,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1164-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
 ; GFX1164-NEXT:    s_mov_b32 s2, -1
 ; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX1164-NEXT:    s_nop 0
-; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: add_i32_constant:
@@ -255,8 +253,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1132-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
 ; GFX1132-NEXT:    s_mov_b32 s2, -1
 ; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX1132-NEXT:    s_nop 0
-; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 ;
 ; GFX1264-LABEL: add_i32_constant:
@@ -294,8 +290,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1264-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
 ; GFX1264-NEXT:    s_mov_b32 s2, -1
 ; GFX1264-NEXT:    buffer_store_b32 v0, off, s[0:3], null
-; GFX1264-NEXT:    s_nop 0
-; GFX1264-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1264-NEXT:    s_endpgm
 ;
 ; GFX1232-LABEL: add_i32_constant:
@@ -333,8 +327,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1232-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
 ; GFX1232-NEXT:    s_mov_b32 s2, -1
 ; GFX1232-NEXT:    buffer_store_b32 v0, off, s[0:3], null
-; GFX1232-NEXT:    s_nop 0
-; GFX1232-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1232-NEXT:    s_endpgm
 entry:
   %old = atomicrmw add ptr addrspace(1) %inout, i32 5 syncscope("agent") acq_rel
@@ -551,8 +543,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164-NEXT:    v_mad_u64_u32 v[1:2], null, s2, v0, s[0:1]
 ; GFX1164-NEXT:    buffer_store_b32 v1, off, s[4:7], 0
-; GFX1164-NEXT:    s_nop 0
-; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: add_i32_uniform:
@@ -589,8 +579,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_mad_u64_u32 v[1:2], null, s0, v0, s[2:3]
 ; GFX1132-NEXT:    buffer_store_b32 v1, off, s[4:7], 0
-; GFX1132-NEXT:    s_nop 0
-; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 ;
 ; GFX1264-LABEL: add_i32_uniform:
@@ -628,8 +616,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1264-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1264-NEXT:    v_mad_co_u64_u32 v[0:1], null, s2, v0, s[0:1]
 ; GFX1264-NEXT:    buffer_store_b32 v0, off, s[4:7], null
-; GFX1264-NEXT:    s_nop 0
-; GFX1264-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1264-NEXT:    s_endpgm
 ;
 ; GFX1232-LABEL: add_i32_uniform:
@@ -666,8 +652,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1232-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1232-NEXT:    v_mad_co_u64_u32 v[0:1], null, s0, v0, s[2:3]
 ; GFX1232-NEXT:    buffer_store_b32 v0, off, s[4:7], null
-; GFX1232-NEXT:    s_nop 0
-; GFX1232-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1232-NEXT:    s_endpgm
 entry:
   %old = atomicrmw add ptr addrspace(1) %inout, i32 %additive syncscope("agent") acq_rel
@@ -953,8 +937,6 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1164_ITERATIVE-NEXT:    v_add_nc_u32_e32 v0, s2, v0
 ; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s2, -1
 ; GFX1164_ITERATIVE-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX1164_ITERATIVE-NEXT:    s_nop 0
-; GFX1164_ITERATIVE-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164_ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX1132_ITERATIVE-LABEL: add_i32_varying:
@@ -1004,8 +986,6 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1132_ITERATIVE-NEXT:    v_add_nc_u32_e32 v0, s2, v0
 ; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s2, -1
 ; GFX1132_ITERATIVE-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX1132_ITERATIVE-NEXT:    s_nop 0
-; GFX1132_ITERATIVE-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132_ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX1264_ITERATIVE-LABEL: add_i32_varying:
@@ -1057,8 +1037,6 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1264_ITERATIVE-NEXT:    v_add_nc_u32_e32 v0, s2, v0
 ; GFX1264_ITERATIVE-NEXT:    s_mov_b32 s2, -1
 ; GFX1264_ITERATIVE-NEXT:    buffer_store_b32 v0, off, s[0:3], null
-; GFX1264_ITERATIVE-NEXT:    s_nop 0
-; GFX1264_ITERATIVE-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1264_ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX1232_ITERATIVE-LABEL: add_i32_varying:
@@ -1111,8 +1089,6 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1232_ITERATIVE-NEXT:    v_add_nc_u32_e32 v0, s2, v0
 ; GFX1232_ITERATIVE-NEXT:    s_mov_b32 s2, -1
 ; GFX1232_ITERATIVE-NEXT:    buffer_store_b32 v0, off, s[0:3], null
-; GFX1232_ITERATIVE-NEXT:    s_nop 0
-; GFX1232_ITERATIVE-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1232_ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX7LESS_DPP-LABEL: add_i32_varying:
@@ -1412,8 +1388,6 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1164_DPP-NEXT:    v_add_nc_u32_e32 v0, s2, v0
 ; GFX1164_DPP-NEXT:    s_mov_b32 s2, s6
 ; GFX1164_DPP-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX1164_DPP-NEXT:    s_nop 0
-; GFX1164_DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164_DPP-NEXT:    s_endpgm
 ;
 ; GFX1132_DPP-LABEL: add_i32_varying:
@@ -1471,8 +1445,6 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1132_DPP-NEXT:    v_add_nc_u32_e32 v0, s2, v0
 ; GFX1132_DPP-NEXT:    s_mov_b32 s2, s6
 ; GFX1132_DPP-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX1132_DPP-NEXT:    s_nop 0
-; GFX1132_DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132_DPP-NEXT:    s_endpgm
 ;
 ; GFX1264_DPP-LABEL: add_i32_varying:
@@ -1545,8 +1517,6 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1264_DPP-NEXT:    v_add_nc_u32_e32 v0, s2, v0
 ; GFX1264_DPP-NEXT:    s_mov_b32 s2, s6
 ; GFX1264_DPP-NEXT:    buffer_store_b32 v0, off, s[0:3], null
-; GFX1264_DPP-NEXT:    s_nop 0
-; GFX1264_DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1264_DPP-NEXT:    s_endpgm
 ;
 ; GFX1232_DPP-LABEL: add_i32_varying:
@@ -1605,8 +1575,6 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1232_DPP-NEXT:    v_add_nc_u32_e32 v0, s2, v0
 ; GFX1232_DPP-NEXT:    s_mov_b32 s2, s6
 ; GFX1232_DPP-NEXT:    buffer_store_b32 v0, off, s[0:3], null
-; GFX1232_DPP-NEXT:    s_nop 0
-; GFX1232_DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1232_DPP-NEXT:    s_endpgm
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1834,8 +1802,6 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1164-NEXT:    s_mov_b32 s2, -1
 ; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX1164-NEXT:    s_nop 0
-; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: add_i64_constant:
@@ -1871,8 +1837,6 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1132-NEXT:    s_mov_b32 s2, -1
 ; GFX1132-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX1132-NEXT:    s_nop 0
-; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 ;
 ; GFX1264-LABEL: add_i64_constant:
@@ -1912,8 +1876,6 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1264-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1264-NEXT:    s_mov_b32 s2, -1
 ; GFX1264-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], null
-; GFX1264-NEXT:    s_nop 0
-; GFX1264-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1264-NEXT:    s_endpgm
 ;
 ; GFX1232-LABEL: add_i64_constant:
@@ -1952,8 +1914,6 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1232-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1232-NEXT:    s_mov_b32 s2, -1
 ; GFX1232-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], null
-; GFX1232-NEXT:    s_nop 0
-; GFX1232-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1232-NEXT:    s_endpgm
 entry:
   %old = atomicrmw add ptr addrspace(1) %inout, i64 5  syncscope("agent") acq_rel
@@ -2214,8 +2174,6 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
-; GFX1164-NEXT:    s_nop 0
-; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: add_i64_uniform:
@@ -2260,8 +2218,6 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX1132-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
-; GFX1132-NEXT:    s_nop 0
-; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 ;
 ; GFX1264-LABEL: add_i64_uniform:
@@ -2304,8 +2260,6 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1264-NEXT:    v_mad_co_u64_u32 v[0:1], null, s0, v2, s[2:3]
 ; GFX1264-NEXT:    v_mad_co_u64_u32 v[1:2], null, s1, v2, v[1:2]
 ; GFX1264-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], null
-; GFX1264-NEXT:    s_nop 0
-; GFX1264-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1264-NEXT:    s_endpgm
 ;
 ; GFX1232-LABEL: add_i64_uniform:
@@ -2346,8 +2300,6 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1232-NEXT:    v_mad_co_u64_u32 v[0:1], null, s0, v2, s[2:3]
 ; GFX1232-NEXT:    v_mad_co_u64_u32 v[1:2], null, s1, v2, v[1:2]
 ; GFX1232-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], null
-; GFX1232-NEXT:    s_nop 0
-; GFX1232-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1232-NEXT:    s_endpgm
 entry:
   %old = atomicrmw add ptr addrspace(1) %inout, i64 %additive  syncscope("agent") acq_rel
@@ -2679,8 +2631,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s2, -1
 ; GFX1164_ITERATIVE-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX1164_ITERATIVE-NEXT:    s_nop 0
-; GFX1164_ITERATIVE-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164_ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX1132_ITERATIVE-LABEL: add_i64_varying:
@@ -2736,8 +2686,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s2, -1
 ; GFX1132_ITERATIVE-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX1132_ITERATIVE-NEXT:    s_nop 0
-; GFX1132_ITERATIVE-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132_ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX1264_ITERATIVE-LABEL: add_i64_varying:
@@ -2794,8 +2742,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1264_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1264_ITERATIVE-NEXT:    s_mov_b32 s2, -1
 ; GFX1264_ITERATIVE-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], null
-; GFX1264_ITERATIVE-NEXT:    s_nop 0
-; GFX1264_ITERATIVE-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1264_ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX1232_ITERATIVE-LABEL: add_i64_varying:
@@ -2849,8 +2795,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1232_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1232_ITERATIVE-NEXT:    s_mov_b32 s2, -1
 ; GFX1232_ITERATIVE-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], null
-; GFX1232_ITERATIVE-NEXT:    s_nop 0
-; GFX1232_ITERATIVE-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1232_ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX7LESS_DPP-LABEL: add_i64_varying:
@@ -3352,8 +3296,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v9, vcc, s3, v11, vcc
 ; GFX1164_DPP-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1164_DPP-NEXT:    buffer_store_b64 v[8:9], off, s[0:3], 0
-; GFX1164_DPP-NEXT:    s_nop 0
-; GFX1164_DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164_DPP-NEXT:    s_endpgm
 ;
 ; GFX1132_DPP-LABEL: add_i64_varying:
@@ -3442,8 +3384,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, s3, v11, vcc_lo
 ; GFX1132_DPP-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1132_DPP-NEXT:    buffer_store_b64 v[8:9], off, s[0:3], 0
-; GFX1132_DPP-NEXT:    s_nop 0
-; GFX1132_DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132_DPP-NEXT:    s_endpgm
 ;
 ; GFX1264_DPP-LABEL: add_i64_varying:
@@ -3556,8 +3496,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1264_DPP-NEXT:    v_add_co_ci_u32_e32 v9, vcc, s3, v11, vcc
 ; GFX1264_DPP-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1264_DPP-NEXT:    buffer_store_b64 v[8:9], off, s[0:3], null
-; GFX1264_DPP-NEXT:    s_nop 0
-; GFX1264_DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1264_DPP-NEXT:    s_endpgm
 ;
 ; GFX1232_DPP-LABEL: add_i64_varying:
@@ -3647,8 +3585,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1232_DPP-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, s3, v11, vcc_lo
 ; GFX1232_DPP-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1232_DPP-NEXT:    buffer_store_b64 v[8:9], off, s[0:3], null
-; GFX1232_DPP-NEXT:    s_nop 0
-; GFX1232_DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1232_DPP-NEXT:    s_endpgm
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
@@ -3861,8 +3797,6 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1164-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
 ; GFX1164-NEXT:    s_mov_b32 s2, -1
 ; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX1164-NEXT:    s_nop 0
-; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: sub_i32_constant:
@@ -3898,8 +3832,6 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1132-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
 ; GFX1132-NEXT:    s_mov_b32 s2, -1
 ; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX1132-NEXT:    s_nop 0
-; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 ;
 ; GFX1264-LABEL: sub_i32_constant:
@@ -3938,8 +3870,6 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1264-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
 ; GFX1264-NEXT:    s_mov_b32 s2, -1
 ; GFX1264-NEXT:    buffer_store_b32 v0, off, s[0:3], null
-; GFX1264-NEXT:    s_nop 0
-; GFX1264-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1264-NEXT:    s_endpgm
 ;
 ; GFX1232-LABEL: sub_i32_constant:
@@ -3978,8 +3908,6 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1232-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
 ; GFX1232-NEXT:    s_mov_b32 s2, -1
 ; GFX1232-NEXT:    buffer_store_b32 v0, off, s[0:3], null
-; GFX1232-NEXT:    s_nop 0
-; GFX1232-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1232-NEXT:    s_endpgm
 entry:
   %old = atomicrmw sub ptr addrspace(1) %inout, i32 5 syncscope("agent") acq_rel
@@ -4199,8 +4127,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
 ; GFX1164-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX1164-NEXT:    s_nop 0
-; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: sub_i32_uniform:
@@ -4238,8 +4164,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
 ; GFX1132-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX1132-NEXT:    s_nop 0
-; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 ;
 ; GFX1264-LABEL: sub_i32_uniform:
@@ -4278,8 +4202,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1264-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1264-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
 ; GFX1264-NEXT:    buffer_store_b32 v0, off, s[4:7], null
-; GFX1264-NEXT:    s_nop 0
-; GFX1264-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1264-NEXT:    s_endpgm
 ;
 ; GFX1232-LABEL: sub_i32_uniform:
@@ -4317,8 +4239,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1232-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1232-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
 ; GFX1232-NEXT:    buffer_store_b32 v0, off, s[4:7], null
-; GFX1232-NEXT:    s_nop 0
-; GFX1232-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1232-NEXT:    s_endpgm
 entry:
   %old = atomicrmw sub ptr addrspace(1) %inout, i32 %subitive syncscope("agent") acq_rel
@@ -4604,8 +4524,6 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1164_ITERATIVE-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
 ; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s2, -1
 ; GFX1164_ITERATIVE-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX1164_ITERATIVE-NEXT:    s_nop 0
-; GFX1164_ITERATIVE-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164_ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX1132_ITERATIVE-LABEL: sub_i32_varying:
@@ -4655,8 +4573,6 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1132_ITERATIVE-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
 ; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s2, -1
 ; GFX1132_ITERATIVE-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX1132_ITERATIVE-NEXT:    s_nop 0
-; GFX1132_ITERATIVE-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132_ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX1264_ITERATIVE-LABEL: sub_i32_varying:
@@ -4708,8 +4624,6 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1264_ITERATIVE-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
 ; GFX1264_ITERATIVE-NEXT:    s_mov_b32 s2, -1
 ; GFX1264_ITERATIVE-NEXT:    buffer_store_b32 v0, off, s[0:3], null
-; GFX1264_ITERATIVE-NEXT:    s_nop 0
-; GFX1264_ITERATIVE-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1264_ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX1232_ITERATIVE-LABEL: sub_i32_varying:
@@ -4762,8 +4676,6 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1232_ITERATIVE-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
 ; GFX1232_ITERATIVE-NEXT:    s_mov_b32 s2, -1
 ; GFX1232_ITERATIVE-NEXT:    buffer_store_b32 v0, off, s[0:3], null
-; GFX1232_ITERATIVE-NEXT:    s_nop 0
-; GFX1232_ITERATIVE-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1232_ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX7LESS_DPP-LABEL: sub_i32_varying:
@@ -5063,8 +4975,6 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1164_DPP-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
 ; GFX1164_DPP-NEXT:    s_mov_b32 s2, s6
 ; GFX1164_DPP-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX1164_DPP-NEXT:    s_nop 0
-; GFX1164_DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164_DPP-NEXT:    s_endpgm
 ;
 ; GFX1132_DPP-LABEL: sub_i32_varying:
@@ -5122,8 +5032,6 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1132_DPP-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
 ; GFX1132_DPP-NEXT:    s_mov_b32 s2, s6
 ; GFX1132_DPP-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX1132_DPP-NEXT:    s_nop 0
-; GFX1132_DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132_DPP-NEXT:    s_endpgm
 ;
 ; GFX1264_DPP-LABEL: sub_i32_varying:
@@ -5196,8 +5104,6 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1264_DPP-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
 ; GFX1264_DPP-NEXT:    s_mov_b32 s2, s6
 ; GFX1264_DPP-NEXT:    buffer_store_b32 v0, off, s[0:3], null
-; GFX1264_DPP-NEXT:    s_nop 0
-; GFX1264_DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1264_DPP-NEXT:    s_endpgm
 ;
 ; GFX1232_DPP-LABEL: sub_i32_varying:
@@ -5256,8 +5162,6 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1232_DPP-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
 ; GFX1232_DPP-NEXT:    s_mov_b32 s2, s6
 ; GFX1232_DPP-NEXT:    buffer_store_b32 v0, off, s[0:3], null
-; GFX1232_DPP-NEXT:    s_nop 0
-; GFX1232_DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1232_DPP-NEXT:    s_endpgm
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
@@ -5497,8 +5401,6 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1164-NEXT:    s_mov_b32 s2, -1
 ; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX1164-NEXT:    s_nop 0
-; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: sub_i64_constant:
@@ -5537,8 +5439,6 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1132-NEXT:    s_mov_b32 s2, -1
 ; GFX1132-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX1132-NEXT:    s_nop 0
-; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 ;
 ; GFX1264-LABEL: sub_i64_constant:
@@ -5581,8 +5481,6 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1264-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1264-NEXT:    s_mov_b32 s2, -1
 ; GFX1264-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], null
-; GFX1264-NEXT:    s_nop 0
-; GFX1264-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1264-NEXT:    s_endpgm
 ;
 ; GFX1232-LABEL: sub_i64_constant:
@@ -5624,8 +5522,6 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1232-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1232-NEXT:    s_mov_b32 s2, -1
 ; GFX1232-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], null
-; GFX1232-NEXT:    s_nop 0
-; GFX1232-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1232-NEXT:    s_endpgm
 entry:
   %old = atomicrmw sub ptr addrspace(1) %inout, i64 5 syncscope("agent") acq_rel
@@ -5897,8 +5793,6 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1164-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX1164-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s1, v1, vcc
 ; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
-; GFX1164-NEXT:    s_nop 0
-; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: sub_i64_uniform:
@@ -5945,8 +5839,6 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1132-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX1132-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
 ; GFX1132-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
-; GFX1132-NEXT:    s_nop 0
-; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 ;
 ; GFX1264-LABEL: sub_i64_uniform:
@@ -5993,8 +5885,6 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1264-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX1264-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s1, v1, vcc
 ; GFX1264-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], null
-; GFX1264-NEXT:    s_nop 0
-; GFX1264-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1264-NEXT:    s_endpgm
 ;
 ; GFX1232-LABEL: sub_i64_uniform:
@@ -6039,8 +5929,6 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1232-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX1232-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
 ; GFX1232-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], null
-; GFX1232-NEXT:    s_nop 0
-; GFX1232-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1232-NEXT:    s_endpgm
 entry:
   %old = atomicrmw sub ptr addrspace(1) %inout, i64 %subitive syncscope("agent") acq_rel
@@ -6372,8 +6260,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s2, -1
 ; GFX1164_ITERATIVE-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX1164_ITERATIVE-NEXT:    s_nop 0
-; GFX1164_ITERATIVE-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164_ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX1132_ITERATIVE-LABEL: sub_i64_varying:
@@ -6429,8 +6315,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s2, -1
 ; GFX1132_ITERATIVE-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX1132_ITERATIVE-NEXT:    s_nop 0
-; GFX1132_ITERATIVE-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132_ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX1264_ITERATIVE-LABEL: sub_i64_varying:
@@ -6487,8 +6371,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1264_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1264_ITERATIVE-NEXT:    s_mov_b32 s2, -1
 ; GFX1264_ITERATIVE-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], null
-; GFX1264_ITERATIVE-NEXT:    s_nop 0
-; GFX1264_ITERATIVE-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1264_ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX1232_ITERATIVE-LABEL: sub_i64_varying:
@@ -6542,8 +6424,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1232_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1232_ITERATIVE-NEXT:    s_mov_b32 s2, -1
 ; GFX1232_ITERATIVE-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], null
-; GFX1232_ITERATIVE-NEXT:    s_nop 0
-; GFX1232_ITERATIVE-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1232_ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX7LESS_DPP-LABEL: sub_i64_varying:
@@ -7045,8 +6925,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1164_DPP-NEXT:    v_sub_co_ci_u32_e32 v9, vcc, s3, v11, vcc
 ; GFX1164_DPP-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1164_DPP-NEXT:    buffer_store_b64 v[8:9], off, s[0:3], 0
-; GFX1164_DPP-NEXT:    s_nop 0
-; GFX1164_DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164_DPP-NEXT:    s_endpgm
 ;
 ; GFX1132_DPP-LABEL: sub_i64_varying:
@@ -7135,8 +7013,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1132_DPP-NEXT:    v_sub_co_ci_u32_e32 v9, vcc_lo, s3, v11, vcc_lo
 ; GFX1132_DPP-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1132_DPP-NEXT:    buffer_store_b64 v[8:9], off, s[0:3], 0
-; GFX1132_DPP-NEXT:    s_nop 0
-; GFX1132_DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132_DPP-NEXT:    s_endpgm
 ;
 ; GFX1264_DPP-LABEL: sub_i64_varying:
@@ -7249,8 +7125,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1264_DPP-NEXT:    v_sub_co_ci_u32_e32 v9, vcc, s3, v11, vcc
 ; GFX1264_DPP-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1264_DPP-NEXT:    buffer_store_b64 v[8:9], off, s[0:3], null
-; GFX1264_DPP-NEXT:    s_nop 0
-; GFX1264_DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1264_DPP-NEXT:    s_endpgm
 ;
 ; GFX1232_DPP-LABEL: sub_i64_varying:
@@ -7340,8 +7214,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1232_DPP-NEXT:    v_sub_co_ci_u32_e32 v9, vcc_lo, s3, v11, vcc_lo
 ; GFX1232_DPP-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1232_DPP-NEXT:    buffer_store_b64 v[8:9], off, s[0:3], null
-; GFX1232_DPP-NEXT:    s_nop 0
-; GFX1232_DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1232_DPP-NEXT:    s_endpgm
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index ce90fbed813103..5924649ebe7350 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -193,8 +193,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) {
 ; GFX1164-NEXT:    s_mov_b32 s2, -1
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX1164-NEXT:    s_nop 0
-; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: add_i32_constant:
@@ -224,8 +222,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) {
 ; GFX1132-NEXT:    s_mov_b32 s2, -1
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX1132-NEXT:    s_nop 0
-; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 entry:
   %old = atomicrmw add ptr addrspace(3) @local_var32, i32 5 acq_rel
@@ -420,8 +416,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive)
 ; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1164-NEXT:    s_mov_b32 s2, -1
 ; GFX1164-NEXT:    buffer_store_b32 v1, off, s[0:3], 0
-; GFX1164-NEXT:    s_nop 0
-; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: add_i32_uniform:
@@ -453,8 +447,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive)
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_mad_u64_u32 v[1:2], null, s0, v0, s[2:3]
 ; GFX1132-NEXT:    buffer_store_b32 v1, off, s[4:7], 0
-; GFX1132-NEXT:    s_nop 0
-; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 entry:
   %old = atomicrmw add ptr addrspace(3) @local_var32, i32 %additive acq_rel
@@ -713,8 +705,6 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
 ; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s2, -1
 ; GFX1164_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164_ITERATIVE-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX1164_ITERATIVE-NEXT:    s_nop 0
-; GFX1164_ITERATIVE-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164_ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX1132_ITERATIVE-LABEL: add_i32_varying:
@@ -758,8 +748,6 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s2, -1
 ; GFX1132_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_ITERATIVE-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX1132_ITERATIVE-NEXT:    s_nop 0
-; GFX1132_ITERATIVE-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132_ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX7LESS_DPP-LABEL: add_i32_varying:
@@ -1017,8 +1005,6 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
 ; GFX1164_DPP-NEXT:    v_add_nc_u32_e32 v0, s0, v0
 ; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164_DPP-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX1164_DPP-NEXT:    s_nop 0
-; GFX1164_DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164_DPP-NEXT:    s_endpgm
 ;
 ; GFX1132_DPP-LABEL: add_i32_varying:
@@ -1067,8 +1053,6 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    v_add_nc_u32_e32 v0, s0, v0
 ; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_DPP-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX1132_DPP-NEXT:    s_nop 0
-; GFX1132_DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132_DPP-NEXT:    s_endpgm
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1674,8 +1658,6 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) {
 ; GFX1164-NEXT:    s_mov_b32 s2, -1
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX1164-NEXT:    s_nop 0
-; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: add_i64_constant:
@@ -1707,8 +1689,6 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) {
 ; GFX1132-NEXT:    s_mov_b32 s2, -1
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX1132-NEXT:    s_nop 0
-; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 entry:
   %old = atomicrmw add ptr addrspace(3) @local_var64, i64 5 acq_rel
@@ -1944,8 +1924,6 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive)
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX1164-NEXT:    s_nop 0
-; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: add_i64_uniform:
@@ -1984,8 +1962,6 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive)
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX1132-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX1132-NEXT:    s_nop 0
-; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 entry:
   %old = atomicrmw add ptr addrspace(3) @local_var64, i64 %additive acq_rel
@@ -2290,8 +2266,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
 ; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s2, -1
 ; GFX1164_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164_ITERATIVE-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX1164_ITERATIVE-NEXT:    s_nop 0
-; GFX1164_ITERATIVE-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164_ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX1132_ITERATIVE-LABEL: add_i64_varying:
@@ -2342,8 +2316,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s2, -1
 ; GFX1132_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_ITERATIVE-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX1132_ITERATIVE-NEXT:    s_nop 0
-; GFX1132_ITERATIVE-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132_ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX7LESS_DPP-LABEL: add_i64_varying:
@@ -2814,8 +2786,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
 ; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e32 v9, vcc, s1, v11, vcc
 ; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164_DPP-NEXT:    buffer_store_b64 v[8:9], off, s[4:7], 0
-; GFX1164_DPP-NEXT:    s_nop 0
-; GFX1164_DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164_DPP-NEXT:    s_endpgm
 ;
 ; GFX1132_DPP-LABEL: add_i64_varying:
@@ -2897,8 +2867,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, s1, v11, vcc_lo
 ; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_DPP-NEXT:    buffer_store_b64 v[8:9], off, s[4:7], 0
-; GFX1132_DPP-NEXT:    s_nop 0
-; GFX1132_DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132_DPP-NEXT:    s_endpgm
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
@@ -3698,8 +3666,6 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
 ; GFX1164-NEXT:    s_mov_b32 s2, -1
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX1164-NEXT:    s_nop 0
-; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: sub_i32_constant:
@@ -3730,8 +3696,6 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
 ; GFX1132-NEXT:    s_mov_b32 s2, -1
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX1132-NEXT:    s_nop 0
-; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 entry:
   %old = atomicrmw sub ptr addrspace(3) @local_var32, i32 5 acq_rel
@@ -3927,8 +3891,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive)
 ; GFX1164-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
 ; GFX1164-NEXT:    s_mov_b32 s2, -1
 ; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX1164-NEXT:    s_nop 0
-; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: sub_i32_uniform:
@@ -3961,8 +3923,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive)
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
 ; GFX1132-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX1132-NEXT:    s_nop 0
-; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 entry:
   %old = atomicrmw sub ptr addrspace(3) @local_var32, i32 %subitive acq_rel
@@ -4221,8 +4181,6 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
 ; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s2, -1
 ; GFX1164_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164_ITERATIVE-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX1164_ITERATIVE-NEXT:    s_nop 0
-; GFX1164_ITERATIVE-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164_ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX1132_ITERATIVE-LABEL: sub_i32_varying:
@@ -4266,8 +4224,6 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s2, -1
 ; GFX1132_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_ITERATIVE-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX1132_ITERATIVE-NEXT:    s_nop 0
-; GFX1132_ITERATIVE-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132_ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX7LESS_DPP-LABEL: sub_i32_varying:
@@ -4525,8 +4481,6 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
 ; GFX1164_DPP-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
 ; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164_DPP-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX1164_DPP-NEXT:    s_nop 0
-; GFX1164_DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164_DPP-NEXT:    s_endpgm
 ;
 ; GFX1132_DPP-LABEL: sub_i32_varying:
@@ -4575,8 +4529,6 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
 ; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_DPP-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX1132_DPP-NEXT:    s_nop 0
-; GFX1132_DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132_DPP-NEXT:    s_endpgm
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
@@ -5193,8 +5145,6 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
 ; GFX1164-NEXT:    s_mov_b32 s2, -1
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX1164-NEXT:    s_nop 0
-; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: sub_i64_constant:
@@ -5229,8 +5179,6 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
 ; GFX1132-NEXT:    s_mov_b32 s2, -1
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX1132-NEXT:    s_nop 0
-; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 entry:
   %old = atomicrmw sub ptr addrspace(3) @local_var64, i64 5 acq_rel
@@ -5476,8 +5424,6 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive)
 ; GFX1164-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX1164-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc
 ; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX1164-NEXT:    s_nop 0
-; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: sub_i64_uniform:
@@ -5518,8 +5464,6 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive)
 ; GFX1132-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX1132-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo
 ; GFX1132-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX1132-NEXT:    s_nop 0
-; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 entry:
   %old = atomicrmw sub ptr addrspace(3) @local_var64, i64 %subitive acq_rel
@@ -5824,8 +5768,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
 ; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s2, -1
 ; GFX1164_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164_ITERATIVE-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX1164_ITERATIVE-NEXT:    s_nop 0
-; GFX1164_ITERATIVE-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164_ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX1132_ITERATIVE-LABEL: sub_i64_varying:
@@ -5876,8 +5818,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s2, -1
 ; GFX1132_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_ITERATIVE-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX1132_ITERATIVE-NEXT:    s_nop 0
-; GFX1132_ITERATIVE-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132_ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX7LESS_DPP-LABEL: sub_i64_varying:
@@ -6348,8 +6288,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
 ; GFX1164_DPP-NEXT:    v_sub_co_ci_u32_e32 v9, vcc, s1, v11, vcc
 ; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164_DPP-NEXT:    buffer_store_b64 v[8:9], off, s[4:7], 0
-; GFX1164_DPP-NEXT:    s_nop 0
-; GFX1164_DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164_DPP-NEXT:    s_endpgm
 ;
 ; GFX1132_DPP-LABEL: sub_i64_varying:
@@ -6431,8 +6369,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    v_sub_co_ci_u32_e32 v9, vcc_lo, s1, v11, vcc_lo
 ; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_DPP-NEXT:    buffer_store_b64 v[8:9], off, s[4:7], 0
-; GFX1132_DPP-NEXT:    s_nop 0
-; GFX1132_DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132_DPP-NEXT:    s_endpgm
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
@@ -6693,8 +6629,6 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
 ; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s2, -1
 ; GFX1164_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164_ITERATIVE-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX1164_ITERATIVE-NEXT:    s_nop 0
-; GFX1164_ITERATIVE-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164_ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX1132_ITERATIVE-LABEL: and_i32_varying:
@@ -6738,8 +6672,6 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s2, -1
 ; GFX1132_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_ITERATIVE-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX1132_ITERATIVE-NEXT:    s_nop 0
-; GFX1132_ITERATIVE-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132_ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX7LESS_DPP-LABEL: and_i32_varying:
@@ -6997,8 +6929,6 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
 ; GFX1164_DPP-NEXT:    v_and_b32_e32 v0, s0, v0
 ; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164_DPP-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX1164_DPP-NEXT:    s_nop 0
-; GFX1164_DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164_DPP-NEXT:    s_endpgm
 ;
 ; GFX1132_DPP-LABEL: and_i32_varying:
@@ -7047,8 +6977,6 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    v_and_b32_e32 v0, s0, v0
 ; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_DPP-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX1132_DPP-NEXT:    s_nop 0
-; GFX1132_DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132_DPP-NEXT:    s_endpgm
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
@@ -7343,8 +7271,6 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
 ; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s2, -1
 ; GFX1164_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164_ITERATIVE-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX1164_ITERATIVE-NEXT:    s_nop 0
-; GFX1164_ITERATIVE-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164_ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX1132_ITERATIVE-LABEL: and_i64_varying:
@@ -7392,8 +7318,6 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s2, -1
 ; GFX1132_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_ITERATIVE-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX1132_ITERATIVE-NEXT:    s_nop 0
-; GFX1132_ITERATIVE-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132_ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX7LESS_DPP-LABEL: and_i64_varying:
@@ -7739,8 +7663,6 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
 ; GFX1164_DPP-NEXT:    v_and_b32_e32 v8, s1, v8
 ; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164_DPP-NEXT:    buffer_store_b64 v[8:9], off, s[4:7], 0
-; GFX1164_DPP-NEXT:    s_nop 0
-; GFX1164_DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164_DPP-NEXT:    s_endpgm
 ;
 ; GFX1132_DPP-LABEL: and_i64_varying:
@@ -7808,8 +7730,6 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    v_and_b32_e32 v8, s1, v8
 ; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_DPP-NEXT:    buffer_store_b64 v[8:9], off, s[4:7], 0
-; GFX1132_DPP-NEXT:    s_nop 0
-; GFX1132_DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132_DPP-NEXT:    s_endpgm
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
@@ -8070,8 +7990,6 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
 ; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s2, -1
 ; GFX1164_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164_ITERATIVE-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX1164_ITERATIVE-NEXT:    s_nop 0
-; GFX1164_ITERATIVE-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164_ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX1132_ITERATIVE-LABEL: or_i32_varying:
@@ -8115,8 +8033,6 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s2, -1
 ; GFX1132_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_ITERATIVE-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX1132_ITERATIVE-NEXT:    s_nop 0
-; GFX1132_ITERATIVE-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132_ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX7LESS_DPP-LABEL: or_i32_varying:
@@ -8374,8 +8290,6 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
 ; GFX1164_DPP-NEXT:    v_or_b32_e32 v0, s0, v0
 ; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164_DPP-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX1164_DPP-NEXT:    s_nop 0
-; GFX1164_DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164_DPP-NEXT:    s_endpgm
 ;
 ; GFX1132_DPP-LABEL: or_i32_varying:
@@ -8424,8 +8338,6 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    v_or_b32_e32 v0, s0, v0
 ; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_DPP-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX1132_DPP-NEXT:    s_nop 0
-; GFX1132_DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132_DPP-NEXT:    s_endpgm
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
@@ -8720,8 +8632,6 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
 ; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s2, -1
 ; GFX1164_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164_ITERATIVE-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX1164_ITERATIVE-NEXT:    s_nop 0
-; GFX1164_ITERATIVE-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164_ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX1132_ITERATIVE-LABEL: or_i64_varying:
@@ -8769,8 +8679,6 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s2, -1
 ; GFX1132_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_ITERATIVE-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX1132_ITERATIVE-NEXT:    s_nop 0
-; GFX1132_ITERATIVE-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132_ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX7LESS_DPP-LABEL: or_i64_varying:
@@ -9118,8 +9026,6 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
 ; GFX1164_DPP-NEXT:    v_or_b32_e32 v8, s1, v8
 ; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164_DPP-NEXT:    buffer_store_b64 v[8:9], off, s[4:7], 0
-; GFX1164_DPP-NEXT:    s_nop 0
-; GFX1164_DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164_DPP-NEXT:    s_endpgm
 ;
 ; GFX1132_DPP-LABEL: or_i64_varying:
@@ -9187,8 +9093,6 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    v_or_b32_e32 v8, s1, v8
 ; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_DPP-NEXT:    buffer_store_b64 v[8:9], off, s[4:7], 0
-; GFX1132_DPP-NEXT:    s_nop 0
-; GFX1132_DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132_DPP-NEXT:    s_endpgm
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
@@ -9449,8 +9353,6 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
 ; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s2, -1
 ; GFX1164_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164_ITERATIVE-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX1164_ITERATIVE-NEXT:    s_nop 0
-; GFX1164_ITERATIVE-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164_ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX1132_ITERATIVE-LABEL: xor_i32_varying:
@@ -9494,8 +9396,6 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s2, -1
 ; GFX1132_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_ITERATIVE-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX1132_ITERATIVE-NEXT:    s_nop 0
-; GFX1132_ITERATIVE-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132_ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX7LESS_DPP-LABEL: xor_i32_varying:
@@ -9753,8 +9653,6 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
 ; GFX1164_DPP-NEXT:    v_xor_b32_e32 v0, s0, v0
 ; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164_DPP-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX1164_DPP-NEXT:    s_nop 0
-; GFX1164_DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164_DPP-NEXT:    s_endpgm
 ;
 ; GFX1132_DPP-LABEL: xor_i32_varying:
@@ -9803,8 +9701,6 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    v_xor_b32_e32 v0, s0, v0
 ; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_DPP-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX1132_DPP-NEXT:    s_nop 0
-; GFX1132_DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132_DPP-NEXT:    s_endpgm
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
@@ -10099,8 +9995,6 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
 ; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s2, -1
 ; GFX1164_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164_ITERATIVE-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX1164_ITERATIVE-NEXT:    s_nop 0
-; GFX1164_ITERATIVE-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164_ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX1132_ITERATIVE-LABEL: xor_i64_varying:
@@ -10148,8 +10042,6 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s2, -1
 ; GFX1132_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_ITERATIVE-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX1132_ITERATIVE-NEXT:    s_nop 0
-; GFX1132_ITERATIVE-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132_ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX7LESS_DPP-LABEL: xor_i64_varying:
@@ -10497,8 +10389,6 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
 ; GFX1164_DPP-NEXT:    v_xor_b32_e32 v8, s1, v8
 ; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164_DPP-NEXT:    buffer_store_b64 v[8:9], off, s[4:7], 0
-; GFX1164_DPP-NEXT:    s_nop 0
-; GFX1164_DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164_DPP-NEXT:    s_endpgm
 ;
 ; GFX1132_DPP-LABEL: xor_i64_varying:
@@ -10566,8 +10456,6 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    v_xor_b32_e32 v8, s1, v8
 ; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_DPP-NEXT:    buffer_store_b64 v[8:9], off, s[4:7], 0
-; GFX1132_DPP-NEXT:    s_nop 0
-; GFX1132_DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132_DPP-NEXT:    s_endpgm
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
@@ -10828,8 +10716,6 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
 ; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s2, -1
 ; GFX1164_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164_ITERATIVE-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX1164_ITERATIVE-NEXT:    s_nop 0
-; GFX1164_ITERATIVE-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164_ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX1132_ITERATIVE-LABEL: max_i32_varying:
@@ -10873,8 +10759,6 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s2, -1
 ; GFX1132_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_ITERATIVE-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX1132_ITERATIVE-NEXT:    s_nop 0
-; GFX1132_ITERATIVE-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132_ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX7LESS_DPP-LABEL: max_i32_varying:
@@ -11132,8 +11016,6 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
 ; GFX1164_DPP-NEXT:    v_max_i32_e32 v0, s0, v0
 ; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164_DPP-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX1164_DPP-NEXT:    s_nop 0
-; GFX1164_DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164_DPP-NEXT:    s_endpgm
 ;
 ; GFX1132_DPP-LABEL: max_i32_varying:
@@ -11182,8 +11064,6 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    v_max_i32_e32 v0, s0, v0
 ; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_DPP-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX1132_DPP-NEXT:    s_nop 0
-; GFX1132_DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132_DPP-NEXT:    s_endpgm
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
@@ -11390,8 +11270,6 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
 ; GFX1164-NEXT:    s_mov_b32 s2, -1
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX1164-NEXT:    s_nop 0
-; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: max_i64_constant:
@@ -11423,8 +11301,6 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
 ; GFX1132-NEXT:    s_mov_b32 s2, -1
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX1132-NEXT:    s_nop 0
-; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 entry:
   %old = atomicrmw max ptr addrspace(3) @local_var64, i64 5 acq_rel
@@ -11763,8 +11639,6 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
 ; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s2, -1
 ; GFX1164_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164_ITERATIVE-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX1164_ITERATIVE-NEXT:    s_nop 0
-; GFX1164_ITERATIVE-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164_ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX1132_ITERATIVE-LABEL: max_i64_varying:
@@ -11820,8 +11694,6 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s2, -1
 ; GFX1132_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_ITERATIVE-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX1132_ITERATIVE-NEXT:    s_nop 0
-; GFX1132_ITERATIVE-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132_ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX7LESS_DPP-LABEL: max_i64_varying:
@@ -12346,8 +12218,6 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
 ; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v9, v9, s0, vcc
 ; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164_DPP-NEXT:    buffer_store_b64 v[9:10], off, s[4:7], 0
-; GFX1164_DPP-NEXT:    s_nop 0
-; GFX1164_DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164_DPP-NEXT:    s_endpgm
 ;
 ; GFX1132_DPP-LABEL: max_i64_varying:
@@ -12437,8 +12307,6 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v9, v9, s0, vcc_lo
 ; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_DPP-NEXT:    buffer_store_b64 v[9:10], off, s[4:7], 0
-; GFX1132_DPP-NEXT:    s_nop 0
-; GFX1132_DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132_DPP-NEXT:    s_endpgm
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
@@ -12699,8 +12567,6 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
 ; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s2, -1
 ; GFX1164_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164_ITERATIVE-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX1164_ITERATIVE-NEXT:    s_nop 0
-; GFX1164_ITERATIVE-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164_ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX1132_ITERATIVE-LABEL: min_i32_varying:
@@ -12744,8 +12610,6 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s2, -1
 ; GFX1132_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_ITERATIVE-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX1132_ITERATIVE-NEXT:    s_nop 0
-; GFX1132_ITERATIVE-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132_ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX7LESS_DPP-LABEL: min_i32_varying:
@@ -13003,8 +12867,6 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
 ; GFX1164_DPP-NEXT:    v_min_i32_e32 v0, s0, v0
 ; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164_DPP-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX1164_DPP-NEXT:    s_nop 0
-; GFX1164_DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164_DPP-NEXT:    s_endpgm
 ;
 ; GFX1132_DPP-LABEL: min_i32_varying:
@@ -13053,8 +12915,6 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    v_min_i32_e32 v0, s0, v0
 ; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_DPP-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX1132_DPP-NEXT:    s_nop 0
-; GFX1132_DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132_DPP-NEXT:    s_endpgm
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
@@ -13261,8 +13121,6 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
 ; GFX1164-NEXT:    s_mov_b32 s2, -1
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX1164-NEXT:    s_nop 0
-; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: min_i64_constant:
@@ -13294,8 +13152,6 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
 ; GFX1132-NEXT:    s_mov_b32 s2, -1
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX1132-NEXT:    s_nop 0
-; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 entry:
   %old = atomicrmw min ptr addrspace(3) @local_var64, i64 5 acq_rel
@@ -13634,8 +13490,6 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
 ; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s2, -1
 ; GFX1164_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164_ITERATIVE-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX1164_ITERATIVE-NEXT:    s_nop 0
-; GFX1164_ITERATIVE-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164_ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX1132_ITERATIVE-LABEL: min_i64_varying:
@@ -13691,8 +13545,6 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s2, -1
 ; GFX1132_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_ITERATIVE-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX1132_ITERATIVE-NEXT:    s_nop 0
-; GFX1132_ITERATIVE-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132_ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX7LESS_DPP-LABEL: min_i64_varying:
@@ -14210,8 +14062,6 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
 ; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v9, v9, s0, vcc
 ; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164_DPP-NEXT:    buffer_store_b64 v[9:10], off, s[4:7], 0
-; GFX1164_DPP-NEXT:    s_nop 0
-; GFX1164_DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164_DPP-NEXT:    s_endpgm
 ;
 ; GFX1132_DPP-LABEL: min_i64_varying:
@@ -14300,8 +14150,6 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v9, v9, s0, vcc_lo
 ; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_DPP-NEXT:    buffer_store_b64 v[9:10], off, s[4:7], 0
-; GFX1132_DPP-NEXT:    s_nop 0
-; GFX1132_DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132_DPP-NEXT:    s_endpgm
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
@@ -14562,8 +14410,6 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
 ; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s2, -1
 ; GFX1164_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164_ITERATIVE-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX1164_ITERATIVE-NEXT:    s_nop 0
-; GFX1164_ITERATIVE-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164_ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX1132_ITERATIVE-LABEL: umax_i32_varying:
@@ -14607,8 +14453,6 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s2, -1
 ; GFX1132_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_ITERATIVE-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX1132_ITERATIVE-NEXT:    s_nop 0
-; GFX1132_ITERATIVE-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132_ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX7LESS_DPP-LABEL: umax_i32_varying:
@@ -14866,8 +14710,6 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
 ; GFX1164_DPP-NEXT:    v_max_u32_e32 v0, s0, v0
 ; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164_DPP-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX1164_DPP-NEXT:    s_nop 0
-; GFX1164_DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164_DPP-NEXT:    s_endpgm
 ;
 ; GFX1132_DPP-LABEL: umax_i32_varying:
@@ -14916,8 +14758,6 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    v_max_u32_e32 v0, s0, v0
 ; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_DPP-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX1132_DPP-NEXT:    s_nop 0
-; GFX1132_DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132_DPP-NEXT:    s_endpgm
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
@@ -15121,8 +14961,6 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
 ; GFX1164-NEXT:    s_mov_b32 s2, -1
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX1164-NEXT:    s_nop 0
-; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: umax_i64_constant:
@@ -15154,8 +14992,6 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
 ; GFX1132-NEXT:    s_mov_b32 s2, -1
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX1132-NEXT:    s_nop 0
-; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 entry:
   %old = atomicrmw umax ptr addrspace(3) @local_var64, i64 5 acq_rel
@@ -15488,8 +15324,6 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
 ; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s2, -1
 ; GFX1164_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164_ITERATIVE-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX1164_ITERATIVE-NEXT:    s_nop 0
-; GFX1164_ITERATIVE-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164_ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX1132_ITERATIVE-LABEL: umax_i64_varying:
@@ -15544,8 +15378,6 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s2, -1
 ; GFX1132_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_ITERATIVE-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX1132_ITERATIVE-NEXT:    s_nop 0
-; GFX1132_ITERATIVE-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132_ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX7LESS_DPP-LABEL: umax_i64_varying:
@@ -16056,8 +15888,6 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
 ; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v9, v9, s0, vcc
 ; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164_DPP-NEXT:    buffer_store_b64 v[9:10], off, s[4:7], 0
-; GFX1164_DPP-NEXT:    s_nop 0
-; GFX1164_DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164_DPP-NEXT:    s_endpgm
 ;
 ; GFX1132_DPP-LABEL: umax_i64_varying:
@@ -16146,8 +15976,6 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v9, v9, s0, vcc_lo
 ; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_DPP-NEXT:    buffer_store_b64 v[9:10], off, s[4:7], 0
-; GFX1132_DPP-NEXT:    s_nop 0
-; GFX1132_DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132_DPP-NEXT:    s_endpgm
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
@@ -16408,8 +16236,6 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
 ; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s2, -1
 ; GFX1164_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164_ITERATIVE-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX1164_ITERATIVE-NEXT:    s_nop 0
-; GFX1164_ITERATIVE-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164_ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX1132_ITERATIVE-LABEL: umin_i32_varying:
@@ -16453,8 +16279,6 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s2, -1
 ; GFX1132_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_ITERATIVE-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX1132_ITERATIVE-NEXT:    s_nop 0
-; GFX1132_ITERATIVE-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132_ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX7LESS_DPP-LABEL: umin_i32_varying:
@@ -16712,8 +16536,6 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
 ; GFX1164_DPP-NEXT:    v_min_u32_e32 v0, s0, v0
 ; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164_DPP-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX1164_DPP-NEXT:    s_nop 0
-; GFX1164_DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164_DPP-NEXT:    s_endpgm
 ;
 ; GFX1132_DPP-LABEL: umin_i32_varying:
@@ -16762,8 +16584,6 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    v_min_u32_e32 v0, s0, v0
 ; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_DPP-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX1132_DPP-NEXT:    s_nop 0
-; GFX1132_DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132_DPP-NEXT:    s_endpgm
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
@@ -16967,8 +16787,6 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
 ; GFX1164-NEXT:    s_mov_b32 s2, -1
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX1164-NEXT:    s_nop 0
-; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: umin_i64_constant:
@@ -17000,8 +16818,6 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
 ; GFX1132-NEXT:    s_mov_b32 s2, -1
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX1132-NEXT:    s_nop 0
-; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 entry:
   %old = atomicrmw umin ptr addrspace(3) @local_var64, i64 5 acq_rel
@@ -17334,8 +17150,6 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
 ; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s2, -1
 ; GFX1164_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164_ITERATIVE-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX1164_ITERATIVE-NEXT:    s_nop 0
-; GFX1164_ITERATIVE-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164_ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX1132_ITERATIVE-LABEL: umin_i64_varying:
@@ -17390,8 +17204,6 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s2, -1
 ; GFX1132_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_ITERATIVE-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX1132_ITERATIVE-NEXT:    s_nop 0
-; GFX1132_ITERATIVE-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132_ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX7LESS_DPP-LABEL: umin_i64_varying:
@@ -17900,8 +17712,6 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
 ; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v9, v9, s0, vcc
 ; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164_DPP-NEXT:    buffer_store_b64 v[9:10], off, s[4:7], 0
-; GFX1164_DPP-NEXT:    s_nop 0
-; GFX1164_DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164_DPP-NEXT:    s_endpgm
 ;
 ; GFX1132_DPP-LABEL: umin_i64_varying:
@@ -17990,8 +17800,6 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v9, v9, s0, vcc_lo
 ; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_DPP-NEXT:    buffer_store_b64 v[9:10], off, s[4:7], 0
-; GFX1132_DPP-NEXT:    s_nop 0
-; GFX1132_DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132_DPP-NEXT:    s_endpgm
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
index 429e6c489bf6f8..4ae08a0375c8c3 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
@@ -193,8 +193,6 @@ define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspa
 ; GFX1164-NEXT:  ; %bb.5: ; %if
 ; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
 ; GFX1164-NEXT:  .LBB0_6: ; %UnifiedReturnBlock
-; GFX1164-NEXT:    s_nop 0
-; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: add_i32_constant:
@@ -234,8 +232,6 @@ define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspa
 ; GFX1132-NEXT:  ; %bb.5: ; %if
 ; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
 ; GFX1132-NEXT:  .LBB0_6: ; %UnifiedReturnBlock
-; GFX1132-NEXT:    s_nop 0
-; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 entry:
   %cond1 = call i1 @llvm.amdgcn.wqm.vote(i1 true)
@@ -552,8 +548,6 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac
 ; GFX1164-NEXT:  ; %bb.5: ; %if
 ; GFX1164-NEXT:    buffer_store_b32 v4, off, s[0:3], 0
 ; GFX1164-NEXT:  .LBB1_6: ; %UnifiedReturnBlock
-; GFX1164-NEXT:    s_nop 0
-; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: add_i32_varying:
@@ -612,8 +606,6 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac
 ; GFX1132-NEXT:  ; %bb.5: ; %if
 ; GFX1132-NEXT:    buffer_store_b32 v4, off, s[0:3], 0
 ; GFX1132-NEXT:  .LBB1_6: ; %UnifiedReturnBlock
-; GFX1132-NEXT:    s_nop 0
-; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 entry:
   %cond1 = call i1 @llvm.amdgcn.wqm.vote(i1 true)
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
index 9d4dfd8911257a..a1f7d2ca3d3383 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
@@ -184,8 +184,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX11W64-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
 ; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11W64-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11W64-NEXT:    s_nop 0
-; GFX11W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W64-NEXT:    s_endpgm
 ;
 ; GFX11W32-LABEL: add_i32_constant:
@@ -215,8 +213,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX11W32-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
 ; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11W32-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11W32-NEXT:    s_nop 0
-; GFX11W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W32-NEXT:    s_endpgm
 ;
 ; GFX12W64-LABEL: add_i32_constant:
@@ -249,8 +245,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX12W64-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
 ; GFX12W64-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W64-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12W64-NEXT:    s_nop 0
-; GFX12W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12W64-NEXT:    s_endpgm
 ;
 ; GFX12W32-LABEL: add_i32_constant:
@@ -283,8 +277,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX12W32-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
 ; GFX12W32-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W32-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12W32-NEXT:    s_nop 0
-; GFX12W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12W32-NEXT:    s_endpgm
 entry:
   %old = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32 5, ptr addrspace(8) %inout, i32 0, i32 0, i32 0)
@@ -470,8 +462,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX11W64-NEXT:    v_mad_u64_u32 v[1:2], null, s6, v0, s[2:3]
 ; GFX11W64-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11W64-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11W64-NEXT:    s_nop 0
-; GFX11W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W64-NEXT:    s_endpgm
 ;
 ; GFX11W32-LABEL: add_i32_uniform:
@@ -502,8 +492,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX11W32-NEXT:    v_mad_u64_u32 v[1:2], null, s0, v0, s[4:5]
 ; GFX11W32-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11W32-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11W32-NEXT:    s_nop 0
-; GFX11W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W32-NEXT:    s_endpgm
 ;
 ; GFX12W64-LABEL: add_i32_uniform:
@@ -537,8 +525,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX12W64-NEXT:    v_mad_co_u64_u32 v[0:1], null, s6, v0, s[2:3]
 ; GFX12W64-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12W64-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12W64-NEXT:    s_nop 0
-; GFX12W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12W64-NEXT:    s_endpgm
 ;
 ; GFX12W32-LABEL: add_i32_uniform:
@@ -572,8 +558,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX12W32-NEXT:    v_mad_co_u64_u32 v[0:1], null, s0, v0, s[4:5]
 ; GFX12W32-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12W32-NEXT:    global_store_b32 v1, v0, s[2:3]
-; GFX12W32-NEXT:    s_nop 0
-; GFX12W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12W32-NEXT:    s_endpgm
 entry:
   %old = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32 %additive, ptr addrspace(8) %inout, i32 0, i32 0, i32 0)
@@ -830,8 +814,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX11W64-NEXT:    v_add_nc_u32_e32 v0, s2, v0
 ; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11W64-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11W64-NEXT:    s_nop 0
-; GFX11W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W64-NEXT:    s_endpgm
 ;
 ; GFX11W32-LABEL: add_i32_varying_vdata:
@@ -874,8 +856,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX11W32-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0
 ; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11W32-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11W32-NEXT:    s_nop 0
-; GFX11W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W32-NEXT:    s_endpgm
 ;
 ; GFX12W64-LABEL: add_i32_varying_vdata:
@@ -922,8 +902,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX12W64-NEXT:    v_add_nc_u32_e32 v0, s2, v0
 ; GFX12W64-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W64-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12W64-NEXT:    s_nop 0
-; GFX12W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12W64-NEXT:    s_endpgm
 ;
 ; GFX12W32-LABEL: add_i32_varying_vdata:
@@ -970,8 +948,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX12W32-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0
 ; GFX12W32-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W32-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12W32-NEXT:    s_nop 0
-; GFX12W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12W32-NEXT:    s_endpgm
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1044,8 +1020,6 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add
 ; GFX11W64-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11W64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11W64-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11W64-NEXT:    s_nop 0
-; GFX11W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W64-NEXT:    s_endpgm
 ;
 ; GFX11W32-LABEL: add_i32_varying_offset:
@@ -1059,8 +1033,6 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add
 ; GFX11W32-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11W32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11W32-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11W32-NEXT:    s_nop 0
-; GFX11W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W32-NEXT:    s_endpgm
 ;
 ; GFX12W64-LABEL: add_i32_varying_offset:
@@ -1075,8 +1047,6 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add
 ; GFX12W64-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12W64-NEXT:    s_wait_loadcnt 0x0
 ; GFX12W64-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12W64-NEXT:    s_nop 0
-; GFX12W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12W64-NEXT:    s_endpgm
 ;
 ; GFX12W32-LABEL: add_i32_varying_offset:
@@ -1090,8 +1060,6 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add
 ; GFX12W32-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12W32-NEXT:    s_wait_loadcnt 0x0
 ; GFX12W32-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12W32-NEXT:    s_nop 0
-; GFX12W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12W32-NEXT:    s_endpgm
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1275,8 +1243,6 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX11W64-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
 ; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11W64-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11W64-NEXT:    s_nop 0
-; GFX11W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W64-NEXT:    s_endpgm
 ;
 ; GFX11W32-LABEL: sub_i32_constant:
@@ -1307,8 +1273,6 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX11W32-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
 ; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11W32-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11W32-NEXT:    s_nop 0
-; GFX11W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W32-NEXT:    s_endpgm
 ;
 ; GFX12W64-LABEL: sub_i32_constant:
@@ -1342,8 +1306,6 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX12W64-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
 ; GFX12W64-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W64-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12W64-NEXT:    s_nop 0
-; GFX12W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12W64-NEXT:    s_endpgm
 ;
 ; GFX12W32-LABEL: sub_i32_constant:
@@ -1377,8 +1339,6 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX12W32-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
 ; GFX12W32-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W32-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12W32-NEXT:    s_nop 0
-; GFX12W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12W32-NEXT:    s_endpgm
 entry:
   %old = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.sub(i32 5, ptr addrspace(8) %inout, i32 0, i32 0, i32 0)
@@ -1565,8 +1525,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11W64-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
 ; GFX11W64-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11W64-NEXT:    s_nop 0
-; GFX11W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W64-NEXT:    s_endpgm
 ;
 ; GFX11W32-LABEL: sub_i32_uniform:
@@ -1598,8 +1556,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11W32-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
 ; GFX11W32-NEXT:    global_store_b32 v1, v0, s[2:3]
-; GFX11W32-NEXT:    s_nop 0
-; GFX11W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W32-NEXT:    s_endpgm
 ;
 ; GFX12W64-LABEL: sub_i32_uniform:
@@ -1634,8 +1590,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX12W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12W64-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
 ; GFX12W64-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12W64-NEXT:    s_nop 0
-; GFX12W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12W64-NEXT:    s_endpgm
 ;
 ; GFX12W32-LABEL: sub_i32_uniform:
@@ -1669,8 +1623,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX12W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12W32-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
 ; GFX12W32-NEXT:    global_store_b32 v1, v0, s[2:3]
-; GFX12W32-NEXT:    s_nop 0
-; GFX12W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12W32-NEXT:    s_endpgm
 entry:
   %old = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.sub(i32 %subitive, ptr addrspace(8) %inout, i32 0, i32 0, i32 0)
@@ -1927,8 +1879,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX11W64-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
 ; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11W64-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11W64-NEXT:    s_nop 0
-; GFX11W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W64-NEXT:    s_endpgm
 ;
 ; GFX11W32-LABEL: sub_i32_varying_vdata:
@@ -1972,8 +1922,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX11W32-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
 ; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11W32-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11W32-NEXT:    s_nop 0
-; GFX11W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W32-NEXT:    s_endpgm
 ;
 ; GFX12W64-LABEL: sub_i32_varying_vdata:
@@ -2020,8 +1968,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX12W64-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
 ; GFX12W64-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W64-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12W64-NEXT:    s_nop 0
-; GFX12W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12W64-NEXT:    s_endpgm
 ;
 ; GFX12W32-LABEL: sub_i32_varying_vdata:
@@ -2069,8 +2015,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX12W32-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
 ; GFX12W32-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W32-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12W32-NEXT:    s_nop 0
-; GFX12W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12W32-NEXT:    s_endpgm
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
@@ -2143,8 +2087,6 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add
 ; GFX11W64-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11W64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11W64-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11W64-NEXT:    s_nop 0
-; GFX11W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W64-NEXT:    s_endpgm
 ;
 ; GFX11W32-LABEL: sub_i32_varying_offset:
@@ -2158,8 +2100,6 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add
 ; GFX11W32-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11W32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11W32-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11W32-NEXT:    s_nop 0
-; GFX11W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W32-NEXT:    s_endpgm
 ;
 ; GFX12W64-LABEL: sub_i32_varying_offset:
@@ -2174,8 +2114,6 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add
 ; GFX12W64-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12W64-NEXT:    s_wait_loadcnt 0x0
 ; GFX12W64-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12W64-NEXT:    s_nop 0
-; GFX12W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12W64-NEXT:    s_endpgm
 ;
 ; GFX12W32-LABEL: sub_i32_varying_offset:
@@ -2189,8 +2127,6 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add
 ; GFX12W32-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12W32-NEXT:    s_wait_loadcnt 0x0
 ; GFX12W32-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12W32-NEXT:    s_nop 0
-; GFX12W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12W32-NEXT:    s_endpgm
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
index 3fb44e090c61f2..d0987068841b9b 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
@@ -190,8 +190,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX11W64-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
 ; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11W64-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11W64-NEXT:    s_nop 0
-; GFX11W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W64-NEXT:    s_endpgm
 ;
 ; GFX11W32-LABEL: add_i32_constant:
@@ -222,8 +220,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX11W32-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
 ; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11W32-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11W32-NEXT:    s_nop 0
-; GFX11W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W32-NEXT:    s_endpgm
 ;
 ; GFX12W64-LABEL: add_i32_constant:
@@ -257,8 +253,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX12W64-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
 ; GFX12W64-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W64-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12W64-NEXT:    s_nop 0
-; GFX12W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12W64-NEXT:    s_endpgm
 ;
 ; GFX12W32-LABEL: add_i32_constant:
@@ -291,8 +285,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX12W32-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
 ; GFX12W32-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W32-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12W32-NEXT:    s_nop 0
-; GFX12W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12W32-NEXT:    s_endpgm
 entry:
   %old = call i32 @llvm.amdgcn.struct.ptr.buffer.atomic.add(i32 5, ptr addrspace(8) %inout, i32 0, i32 0, i32 0, i32 0)
@@ -484,8 +476,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX11W64-NEXT:    v_mad_u64_u32 v[1:2], null, s6, v0, s[2:3]
 ; GFX11W64-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11W64-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11W64-NEXT:    s_nop 0
-; GFX11W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W64-NEXT:    s_endpgm
 ;
 ; GFX11W32-LABEL: add_i32_uniform:
@@ -517,8 +507,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX11W32-NEXT:    v_mad_u64_u32 v[1:2], null, s0, v0, s[4:5]
 ; GFX11W32-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11W32-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11W32-NEXT:    s_nop 0
-; GFX11W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W32-NEXT:    s_endpgm
 ;
 ; GFX12W64-LABEL: add_i32_uniform:
@@ -553,8 +541,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX12W64-NEXT:    v_mad_co_u64_u32 v[0:1], null, s6, v0, s[2:3]
 ; GFX12W64-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12W64-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12W64-NEXT:    s_nop 0
-; GFX12W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12W64-NEXT:    s_endpgm
 ;
 ; GFX12W32-LABEL: add_i32_uniform:
@@ -588,8 +574,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX12W32-NEXT:    v_mad_co_u64_u32 v[0:1], null, s0, v0, s[4:5]
 ; GFX12W32-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12W32-NEXT:    global_store_b32 v1, v0, s[2:3]
-; GFX12W32-NEXT:    s_nop 0
-; GFX12W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12W32-NEXT:    s_endpgm
 entry:
   %old = call i32 @llvm.amdgcn.struct.ptr.buffer.atomic.add(i32 %additive, ptr addrspace(8) %inout, i32 0, i32 0, i32 0, i32 0)
@@ -852,8 +836,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX11W64-NEXT:    v_add_nc_u32_e32 v0, s2, v0
 ; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11W64-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11W64-NEXT:    s_nop 0
-; GFX11W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W64-NEXT:    s_endpgm
 ;
 ; GFX11W32-LABEL: add_i32_varying_vdata:
@@ -896,8 +878,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX11W32-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0
 ; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11W32-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11W32-NEXT:    s_nop 0
-; GFX11W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W32-NEXT:    s_endpgm
 ;
 ; GFX12W64-LABEL: add_i32_varying_vdata:
@@ -945,8 +925,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX12W64-NEXT:    v_add_nc_u32_e32 v0, s2, v0
 ; GFX12W64-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W64-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12W64-NEXT:    s_nop 0
-; GFX12W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12W64-NEXT:    s_endpgm
 ;
 ; GFX12W32-LABEL: add_i32_varying_vdata:
@@ -993,8 +971,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX12W32-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0
 ; GFX12W32-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W32-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12W32-NEXT:    s_nop 0
-; GFX12W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12W32-NEXT:    s_endpgm
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1067,8 +1043,6 @@ define amdgpu_kernel void @add_i32_varying_vindex(ptr addrspace(1) %out, ptr add
 ; GFX11W64-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11W64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11W64-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11W64-NEXT:    s_nop 0
-; GFX11W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W64-NEXT:    s_endpgm
 ;
 ; GFX11W32-LABEL: add_i32_varying_vindex:
@@ -1082,8 +1056,6 @@ define amdgpu_kernel void @add_i32_varying_vindex(ptr addrspace(1) %out, ptr add
 ; GFX11W32-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11W32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11W32-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11W32-NEXT:    s_nop 0
-; GFX11W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W32-NEXT:    s_endpgm
 ;
 ; GFX12W64-LABEL: add_i32_varying_vindex:
@@ -1098,8 +1070,6 @@ define amdgpu_kernel void @add_i32_varying_vindex(ptr addrspace(1) %out, ptr add
 ; GFX12W64-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12W64-NEXT:    s_wait_loadcnt 0x0
 ; GFX12W64-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12W64-NEXT:    s_nop 0
-; GFX12W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12W64-NEXT:    s_endpgm
 ;
 ; GFX12W32-LABEL: add_i32_varying_vindex:
@@ -1113,8 +1083,6 @@ define amdgpu_kernel void @add_i32_varying_vindex(ptr addrspace(1) %out, ptr add
 ; GFX12W32-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12W32-NEXT:    s_wait_loadcnt 0x0
 ; GFX12W32-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12W32-NEXT:    s_nop 0
-; GFX12W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12W32-NEXT:    s_endpgm
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1199,8 +1167,6 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add
 ; GFX11W64-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11W64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11W64-NEXT:    global_store_b32 v0, v2, s[0:1]
-; GFX11W64-NEXT:    s_nop 0
-; GFX11W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W64-NEXT:    s_endpgm
 ;
 ; GFX11W32-LABEL: add_i32_varying_offset:
@@ -1216,8 +1182,6 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add
 ; GFX11W32-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11W32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11W32-NEXT:    global_store_b32 v0, v2, s[0:1]
-; GFX11W32-NEXT:    s_nop 0
-; GFX11W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W32-NEXT:    s_endpgm
 ;
 ; GFX12W64-LABEL: add_i32_varying_offset:
@@ -1232,8 +1196,6 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add
 ; GFX12W64-NEXT:    buffer_atomic_add_u32 v2, v[0:1], s[4:7], null idxen offen th:TH_ATOMIC_RETURN
 ; GFX12W64-NEXT:    s_wait_loadcnt 0x0
 ; GFX12W64-NEXT:    global_store_b32 v0, v2, s[0:1]
-; GFX12W64-NEXT:    s_nop 0
-; GFX12W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12W64-NEXT:    s_endpgm
 ;
 ; GFX12W32-LABEL: add_i32_varying_offset:
@@ -1247,8 +1209,6 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add
 ; GFX12W32-NEXT:    buffer_atomic_add_u32 v2, v[0:1], s[4:7], null idxen offen th:TH_ATOMIC_RETURN
 ; GFX12W32-NEXT:    s_wait_loadcnt 0x0
 ; GFX12W32-NEXT:    global_store_b32 v0, v2, s[0:1]
-; GFX12W32-NEXT:    s_nop 0
-; GFX12W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12W32-NEXT:    s_endpgm
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1438,8 +1398,6 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX11W64-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
 ; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11W64-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11W64-NEXT:    s_nop 0
-; GFX11W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W64-NEXT:    s_endpgm
 ;
 ; GFX11W32-LABEL: sub_i32_constant:
@@ -1471,8 +1429,6 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX11W32-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
 ; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11W32-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11W32-NEXT:    s_nop 0
-; GFX11W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W32-NEXT:    s_endpgm
 ;
 ; GFX12W64-LABEL: sub_i32_constant:
@@ -1507,8 +1463,6 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX12W64-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
 ; GFX12W64-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W64-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12W64-NEXT:    s_nop 0
-; GFX12W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12W64-NEXT:    s_endpgm
 ;
 ; GFX12W32-LABEL: sub_i32_constant:
@@ -1542,8 +1496,6 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX12W32-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
 ; GFX12W32-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W32-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12W32-NEXT:    s_nop 0
-; GFX12W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12W32-NEXT:    s_endpgm
 entry:
   %old = call i32 @llvm.amdgcn.struct.ptr.buffer.atomic.sub(i32 5, ptr addrspace(8) %inout, i32 0, i32 0, i32 0, i32 0)
@@ -1736,8 +1688,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11W64-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
 ; GFX11W64-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11W64-NEXT:    s_nop 0
-; GFX11W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W64-NEXT:    s_endpgm
 ;
 ; GFX11W32-LABEL: sub_i32_uniform:
@@ -1770,8 +1720,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11W32-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
 ; GFX11W32-NEXT:    global_store_b32 v1, v0, s[2:3]
-; GFX11W32-NEXT:    s_nop 0
-; GFX11W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W32-NEXT:    s_endpgm
 ;
 ; GFX12W64-LABEL: sub_i32_uniform:
@@ -1807,8 +1755,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX12W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12W64-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
 ; GFX12W64-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12W64-NEXT:    s_nop 0
-; GFX12W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12W64-NEXT:    s_endpgm
 ;
 ; GFX12W32-LABEL: sub_i32_uniform:
@@ -1842,8 +1788,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX12W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12W32-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
 ; GFX12W32-NEXT:    global_store_b32 v1, v0, s[2:3]
-; GFX12W32-NEXT:    s_nop 0
-; GFX12W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12W32-NEXT:    s_endpgm
 entry:
   %old = call i32 @llvm.amdgcn.struct.ptr.buffer.atomic.sub(i32 %subitive, ptr addrspace(8) %inout, i32 0, i32 0, i32 0, i32 0)
@@ -2106,8 +2050,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX11W64-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
 ; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11W64-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11W64-NEXT:    s_nop 0
-; GFX11W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W64-NEXT:    s_endpgm
 ;
 ; GFX11W32-LABEL: sub_i32_varying_vdata:
@@ -2151,8 +2093,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX11W32-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
 ; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11W32-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11W32-NEXT:    s_nop 0
-; GFX11W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W32-NEXT:    s_endpgm
 ;
 ; GFX12W64-LABEL: sub_i32_varying_vdata:
@@ -2200,8 +2140,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX12W64-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
 ; GFX12W64-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W64-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12W64-NEXT:    s_nop 0
-; GFX12W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12W64-NEXT:    s_endpgm
 ;
 ; GFX12W32-LABEL: sub_i32_varying_vdata:
@@ -2249,8 +2187,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX12W32-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
 ; GFX12W32-NEXT:    s_wait_kmcnt 0x0
 ; GFX12W32-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12W32-NEXT:    s_nop 0
-; GFX12W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12W32-NEXT:    s_endpgm
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
@@ -2323,8 +2259,6 @@ define amdgpu_kernel void @sub_i32_varying_vindex(ptr addrspace(1) %out, ptr add
 ; GFX11W64-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11W64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11W64-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11W64-NEXT:    s_nop 0
-; GFX11W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W64-NEXT:    s_endpgm
 ;
 ; GFX11W32-LABEL: sub_i32_varying_vindex:
@@ -2338,8 +2272,6 @@ define amdgpu_kernel void @sub_i32_varying_vindex(ptr addrspace(1) %out, ptr add
 ; GFX11W32-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11W32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11W32-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11W32-NEXT:    s_nop 0
-; GFX11W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W32-NEXT:    s_endpgm
 ;
 ; GFX12W64-LABEL: sub_i32_varying_vindex:
@@ -2354,8 +2286,6 @@ define amdgpu_kernel void @sub_i32_varying_vindex(ptr addrspace(1) %out, ptr add
 ; GFX12W64-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12W64-NEXT:    s_wait_loadcnt 0x0
 ; GFX12W64-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12W64-NEXT:    s_nop 0
-; GFX12W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12W64-NEXT:    s_endpgm
 ;
 ; GFX12W32-LABEL: sub_i32_varying_vindex:
@@ -2369,8 +2299,6 @@ define amdgpu_kernel void @sub_i32_varying_vindex(ptr addrspace(1) %out, ptr add
 ; GFX12W32-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12W32-NEXT:    s_wait_loadcnt 0x0
 ; GFX12W32-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12W32-NEXT:    s_nop 0
-; GFX12W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12W32-NEXT:    s_endpgm
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
@@ -2455,8 +2383,6 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add
 ; GFX11W64-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11W64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11W64-NEXT:    global_store_b32 v0, v2, s[0:1]
-; GFX11W64-NEXT:    s_nop 0
-; GFX11W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W64-NEXT:    s_endpgm
 ;
 ; GFX11W32-LABEL: sub_i32_varying_offset:
@@ -2472,8 +2398,6 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add
 ; GFX11W32-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11W32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11W32-NEXT:    global_store_b32 v0, v2, s[0:1]
-; GFX11W32-NEXT:    s_nop 0
-; GFX11W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W32-NEXT:    s_endpgm
 ;
 ; GFX12W64-LABEL: sub_i32_varying_offset:
@@ -2488,8 +2412,6 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add
 ; GFX12W64-NEXT:    buffer_atomic_sub_u32 v2, v[0:1], s[4:7], null idxen offen th:TH_ATOMIC_RETURN
 ; GFX12W64-NEXT:    s_wait_loadcnt 0x0
 ; GFX12W64-NEXT:    global_store_b32 v0, v2, s[0:1]
-; GFX12W64-NEXT:    s_nop 0
-; GFX12W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12W64-NEXT:    s_endpgm
 ;
 ; GFX12W32-LABEL: sub_i32_varying_offset:
@@ -2503,8 +2425,6 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add
 ; GFX12W32-NEXT:    buffer_atomic_sub_u32 v2, v[0:1], s[4:7], null idxen offen th:TH_ATOMIC_RETURN
 ; GFX12W32-NEXT:    s_wait_loadcnt 0x0
 ; GFX12W32-NEXT:    global_store_b32 v0, v2, s[0:1]
-; GFX12W32-NEXT:    s_nop 0
-; GFX12W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12W32-NEXT:    s_endpgm
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll b/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll
index 417d38990505b6..c630effa4b048a 100644
--- a/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll
@@ -118,8 +118,6 @@ define amdgpu_kernel void @global_atomic_cond_sub_no_rtn_u32_forced(ptr addrspac
 ; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX12-SDAG-NEXT:    global_atomic_cond_sub_u32 v0, v1, s[0:1] offset:-16
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: global_atomic_cond_sub_no_rtn_u32_forced:
@@ -128,8 +126,6 @@ define amdgpu_kernel void @global_atomic_cond_sub_no_rtn_u32_forced(ptr addrspac
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
 ; GFX12-GISEL-NEXT:    global_atomic_cond_sub_u32 v1, v0, s[0:1] offset:-16
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr addrspace(1) %addr, i32 -4
@@ -148,8 +144,6 @@ define amdgpu_kernel void @global_atomic_cond_sub_rtn_u32(ptr addrspace(1) %addr
 ; GFX12-SDAG-NEXT:    global_atomic_cond_sub_u32 v1, v0, v1, s[4:5] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: global_atomic_cond_sub_rtn_u32:
@@ -162,8 +156,6 @@ define amdgpu_kernel void @global_atomic_cond_sub_rtn_u32(ptr addrspace(1) %addr
 ; GFX12-GISEL-NEXT:    global_atomic_cond_sub_u32 v0, v1, v0, s[4:5] offset:16 th:TH_ATOMIC_RETURN
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr addrspace(1) %addr, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/bitreverse.ll b/llvm/test/CodeGen/AMDGPU/bitreverse.ll
index 6f52da2631b8a6..b281c1bf3f9c48 100644
--- a/llvm/test/CodeGen/AMDGPU/bitreverse.ll
+++ b/llvm/test/CodeGen/AMDGPU/bitreverse.ll
@@ -69,8 +69,6 @@ define amdgpu_kernel void @s_brev_i16(ptr addrspace(1) noalias %out, i16 %val) #
 ; GFX11-FLAT-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-FLAT-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-FLAT-NEXT:    global_store_d16_hi_b16 v0, v1, s[0:1]
-; GFX11-FLAT-NEXT:    s_nop 0
-; GFX11-FLAT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FLAT-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: s_brev_i16:
@@ -87,8 +85,6 @@ define amdgpu_kernel void @s_brev_i16(ptr addrspace(1) noalias %out, i16 %val) #
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-GISEL-NEXT:    global_store_b16 v1, v0, s[0:1]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
   %brev = call i16 @llvm.bitreverse.i16(i16 %val) #1
   store i16 %brev, ptr addrspace(1) %out
@@ -162,8 +158,6 @@ define amdgpu_kernel void @v_brev_i16(ptr addrspace(1) noalias %out, ptr addrspa
 ; GFX11-FLAT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FLAT-NEXT:    v_bfrev_b32_e32 v0, v0
 ; GFX11-FLAT-NEXT:    global_store_d16_hi_b16 v1, v0, s[0:1]
-; GFX11-FLAT-NEXT:    s_nop 0
-; GFX11-FLAT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FLAT-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_brev_i16:
@@ -175,8 +169,6 @@ define amdgpu_kernel void @v_brev_i16(ptr addrspace(1) noalias %out, ptr addrspa
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    v_bfrev_b32_e32 v1, v1
 ; GFX11-GISEL-NEXT:    global_store_d16_hi_b16 v0, v1, s[0:1]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
   %val = load i16, ptr addrspace(1) %valptr
   %brev = call i16 @llvm.bitreverse.i16(i16 %val) #1
@@ -233,8 +225,6 @@ define amdgpu_kernel void @s_brev_i32(ptr addrspace(1) noalias %out, i32 %val) #
 ; GFX11-FLAT-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-FLAT-NEXT:    s_mov_b32 s2, -1
 ; GFX11-FLAT-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX11-FLAT-NEXT:    s_nop 0
-; GFX11-FLAT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FLAT-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: s_brev_i32:
@@ -248,8 +238,6 @@ define amdgpu_kernel void @s_brev_i32(ptr addrspace(1) noalias %out, i32 %val) #
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
   %brev = call i32 @llvm.bitreverse.i32(i32 %val) #1
   store i32 %brev, ptr addrspace(1) %out
@@ -322,8 +310,6 @@ define amdgpu_kernel void @v_brev_i32(ptr addrspace(1) noalias %out, ptr addrspa
 ; GFX11-FLAT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FLAT-NEXT:    v_bfrev_b32_e32 v0, v0
 ; GFX11-FLAT-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX11-FLAT-NEXT:    s_nop 0
-; GFX11-FLAT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FLAT-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_brev_i32:
@@ -337,8 +323,6 @@ define amdgpu_kernel void @v_brev_i32(ptr addrspace(1) noalias %out, ptr addrspa
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    v_bfrev_b32_e32 v0, v0
 ; GFX11-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
@@ -405,8 +389,6 @@ define amdgpu_kernel void @s_brev_v2i32(ptr addrspace(1) noalias %out, <2 x i32>
 ; GFX11-FLAT-NEXT:    s_mov_b32 s4, s0
 ; GFX11-FLAT-NEXT:    s_mov_b32 s5, s1
 ; GFX11-FLAT-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
-; GFX11-FLAT-NEXT:    s_nop 0
-; GFX11-FLAT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FLAT-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: s_brev_v2i32:
@@ -419,8 +401,6 @@ define amdgpu_kernel void @s_brev_v2i32(ptr addrspace(1) noalias %out, <2 x i32>
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
   %brev = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %val) #1
   store <2 x i32> %brev, ptr addrspace(1) %out
@@ -497,8 +477,6 @@ define amdgpu_kernel void @v_brev_v2i32(ptr addrspace(1) noalias %out, ptr addrs
 ; GFX11-FLAT-NEXT:    v_bfrev_b32_e32 v1, v1
 ; GFX11-FLAT-NEXT:    v_bfrev_b32_e32 v0, v0
 ; GFX11-FLAT-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX11-FLAT-NEXT:    s_nop 0
-; GFX11-FLAT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FLAT-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_brev_v2i32:
@@ -514,8 +492,6 @@ define amdgpu_kernel void @v_brev_v2i32(ptr addrspace(1) noalias %out, ptr addrs
 ; GFX11-GISEL-NEXT:    v_bfrev_b32_e32 v0, v0
 ; GFX11-GISEL-NEXT:    v_bfrev_b32_e32 v1, v1
 ; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr <2 x i32>, ptr addrspace(1) %valptr, i32 %tid
@@ -575,8 +551,6 @@ define amdgpu_kernel void @s_brev_i64(ptr addrspace(1) noalias %out, i64 %val) #
 ; GFX11-FLAT-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX11-FLAT-NEXT:    s_mov_b32 s2, -1
 ; GFX11-FLAT-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX11-FLAT-NEXT:    s_nop 0
-; GFX11-FLAT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FLAT-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: s_brev_i64:
@@ -588,8 +562,6 @@ define amdgpu_kernel void @s_brev_i64(ptr addrspace(1) noalias %out, i64 %val) #
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
   %brev = call i64 @llvm.bitreverse.i64(i64 %val) #1
   store i64 %brev, ptr addrspace(1) %out
@@ -666,8 +638,6 @@ define amdgpu_kernel void @v_brev_i64(ptr addrspace(1) noalias %out, ptr addrspa
 ; GFX11-FLAT-NEXT:    v_bfrev_b32_e32 v2, v0
 ; GFX11-FLAT-NEXT:    v_bfrev_b32_e32 v1, v1
 ; GFX11-FLAT-NEXT:    buffer_store_b64 v[1:2], off, s[0:3], 0
-; GFX11-FLAT-NEXT:    s_nop 0
-; GFX11-FLAT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FLAT-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_brev_i64:
@@ -683,8 +653,6 @@ define amdgpu_kernel void @v_brev_i64(ptr addrspace(1) noalias %out, ptr addrspa
 ; GFX11-GISEL-NEXT:    v_bfrev_b32_e32 v2, v0
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-GISEL-NEXT:    global_store_b64 v0, v[1:2], s[0:1]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr i64, ptr addrspace(1) %valptr, i32 %tid
@@ -756,8 +724,6 @@ define amdgpu_kernel void @s_brev_v2i64(ptr addrspace(1) noalias %out, <2 x i64>
 ; GFX11-FLAT-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-FLAT-NEXT:    s_mov_b32 s2, -1
 ; GFX11-FLAT-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
-; GFX11-FLAT-NEXT:    s_nop 0
-; GFX11-FLAT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FLAT-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: s_brev_v2i64:
@@ -772,8 +738,6 @@ define amdgpu_kernel void @s_brev_v2i64(ptr addrspace(1) noalias %out, <2 x i64>
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX11-GISEL-NEXT:    global_store_b128 v4, v[0:3], s[8:9]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
   %brev = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %val) #1
   store <2 x i64> %brev, ptr addrspace(1) %out
@@ -858,8 +822,6 @@ define amdgpu_kernel void @v_brev_v2i64(ptr addrspace(1) noalias %out, ptr addrs
 ; GFX11-FLAT-NEXT:    v_bfrev_b32_e32 v2, v0
 ; GFX11-FLAT-NEXT:    v_bfrev_b32_e32 v1, v1
 ; GFX11-FLAT-NEXT:    buffer_store_b128 v[1:4], off, s[0:3], 0
-; GFX11-FLAT-NEXT:    s_nop 0
-; GFX11-FLAT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FLAT-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_brev_v2i64:
@@ -877,8 +839,6 @@ define amdgpu_kernel void @v_brev_v2i64(ptr addrspace(1) noalias %out, ptr addrs
 ; GFX11-GISEL-NEXT:    v_bfrev_b32_e32 v7, v2
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-GISEL-NEXT:    global_store_b128 v0, v[4:7], s[0:1]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr <2 x i64> , ptr addrspace(1) %valptr, i32 %tid
diff --git a/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll b/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll
index 857b13fab8a7ce..afff56c7f0eb23 100644
--- a/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll
@@ -82,13 +82,9 @@ define amdgpu_kernel void @br_cc_f16(
 ; GFX11-NEXT:    s_cbranch_vccnz .LBB0_2
 ; GFX11-NEXT:  ; %bb.1: ; %one
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ; GFX11-NEXT:  .LBB0_2: ; %two
 ; GFX11-NEXT:    buffer_store_b16 v1, off, s[4:7], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
@@ -173,8 +169,6 @@ define amdgpu_kernel void @br_cc_f16_imm_a(
 ; GFX11-NEXT:    s_mov_b32 s2, s6
 ; GFX11-NEXT:    s_mov_b32 s3, s7
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %b) {
@@ -259,8 +253,6 @@ define amdgpu_kernel void @br_cc_f16_imm_b(
 ; GFX11-NEXT:    s_mov_b32 s2, s6
 ; GFX11-NEXT:    s_mov_b32 s3, s7
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
index bb642afecc9cc5..77f1bc2a172a50 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
@@ -73,8 +73,6 @@ define amdgpu_kernel void @uniform_conditional_max_short_forward_branch(ptr addr
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[2:3] dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: uniform_conditional_max_short_forward_branch:
@@ -104,8 +102,6 @@ define amdgpu_kernel void @uniform_conditional_max_short_forward_branch(ptr addr
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[2:3] scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %cmp = icmp eq i32 %cnd, 0
@@ -183,8 +179,6 @@ define amdgpu_kernel void @uniform_conditional_min_long_forward_branch(ptr addrs
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[2:3] dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: uniform_conditional_min_long_forward_branch:
@@ -214,8 +208,6 @@ define amdgpu_kernel void @uniform_conditional_min_long_forward_branch(ptr addrs
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[2:3] scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb0:
   %cmp = icmp eq i32 %cnd, 0
@@ -298,8 +290,6 @@ define amdgpu_kernel void @uniform_conditional_min_long_forward_vcnd_branch(ptr
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[2:3] dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: uniform_conditional_min_long_forward_vcnd_branch:
@@ -333,8 +323,6 @@ define amdgpu_kernel void @uniform_conditional_min_long_forward_vcnd_branch(ptr
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[2:3] scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb0:
   %cmp = fcmp oeq float %cnd, 0.0
@@ -426,8 +414,6 @@ define amdgpu_kernel void @min_long_forward_vbranch(ptr addrspace(1) %arg) #0 {
 ; GFX11-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX11-NEXT:    global_store_b32 v[0:1], v2, off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: min_long_forward_vbranch:
@@ -465,8 +451,6 @@ define amdgpu_kernel void @min_long_forward_vbranch(ptr addrspace(1) %arg) #0 {
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    global_store_b32 v[0:1], v2, off scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -646,16 +630,8 @@ define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(ptr add
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_cmp_eq_u32 s0, 0
 ; GFX11-NEXT:    s_mov_b64 s[0:1], -1
-; GFX11-NEXT:    s_cbranch_scc0 .LBB5_1
-; GFX11-NEXT:  ; %bb.7: ; %bb0
-; GFX11-NEXT:    s_getpc_b64 s[0:1]
-; GFX11-NEXT:  .Lpost_getpc6:
-; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
-; GFX11-NEXT:    s_add_u32 s0, s0, (.LBB5_4-.Lpost_getpc6)&4294967295
-; GFX11-NEXT:    s_addc_u32 s1, s1, (.LBB5_4-.Lpost_getpc6)>>32
-; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
-; GFX11-NEXT:    s_setpc_b64 s[0:1]
-; GFX11-NEXT:  .LBB5_1: ; %Flow
+; GFX11-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX11-NEXT:  ; %bb.1: ; %Flow
 ; GFX11-NEXT:    s_and_not1_b64 vcc, exec, s[0:1]
 ; GFX11-NEXT:    s_cbranch_vccnz .LBB5_3
 ; GFX11-NEXT:  .LBB5_2: ; %bb2
@@ -669,8 +645,6 @@ define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(ptr add
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1] dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ; GFX11-NEXT:  .LBB5_4: ; %bb3
 ; GFX11-NEXT:    ;;#ASMSTART
@@ -680,12 +654,12 @@ define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(ptr add
 ; GFX11-NEXT:    v_nop_e64
 ; GFX11-NEXT:    ;;#ASMEND
 ; GFX11-NEXT:    s_cbranch_execnz .LBB5_5
-; GFX11-NEXT:  ; %bb.9: ; %bb3
+; GFX11-NEXT:  ; %bb.7: ; %bb3
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
-; GFX11-NEXT:  .Lpost_getpc7:
+; GFX11-NEXT:  .Lpost_getpc6:
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
-; GFX11-NEXT:    s_add_u32 s0, s0, (.LBB5_2-.Lpost_getpc7)&4294967295
-; GFX11-NEXT:    s_addc_u32 s1, s1, (.LBB5_2-.Lpost_getpc7)>>32
+; GFX11-NEXT:    s_add_u32 s0, s0, (.LBB5_2-.Lpost_getpc6)&4294967295
+; GFX11-NEXT:    s_addc_u32 s1, s1, (.LBB5_2-.Lpost_getpc6)>>32
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
 ; GFX11-NEXT:    s_setpc_b64 s[0:1]
 ; GFX11-NEXT:  .LBB5_5: ; %bb3
@@ -725,8 +699,6 @@ define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(ptr add
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 ; GFX12-NEXT:  .LBB5_4: ; %bb3
 ; GFX12-NEXT:    ;;#ASMSTART
@@ -814,10 +786,10 @@ define amdgpu_kernel void @uniform_unconditional_min_long_backward_branch(ptr ad
 ; GFX11-NEXT:  ; %bb.3: ; %loop
 ; GFX11-NEXT:    ; in Loop: Header=BB6_1 Depth=1
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
-; GFX11-NEXT:  .Lpost_getpc8:
+; GFX11-NEXT:  .Lpost_getpc7:
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
-; GFX11-NEXT:    s_add_u32 s0, s0, (.LBB6_1-.Lpost_getpc8)&4294967295
-; GFX11-NEXT:    s_addc_u32 s1, s1, (.LBB6_1-.Lpost_getpc8)>>32
+; GFX11-NEXT:    s_add_u32 s0, s0, (.LBB6_1-.Lpost_getpc7)&4294967295
+; GFX11-NEXT:    s_addc_u32 s1, s1, (.LBB6_1-.Lpost_getpc7)>>32
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
 ; GFX11-NEXT:    s_setpc_b64 s[0:1]
 ; GFX11-NEXT:  .LBB6_2: ; %DummyReturnBlock
@@ -921,10 +893,10 @@ define amdgpu_kernel void @expand_requires_expand(i32 %cond0) #0 {
 ; GFX11-NEXT:    s_cbranch_vccz .LBB7_3
 ; GFX11-NEXT:  ; %bb.5: ; %Flow
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
-; GFX11-NEXT:  .Lpost_getpc9:
+; GFX11-NEXT:  .Lpost_getpc8:
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
-; GFX11-NEXT:    s_add_u32 s0, s0, (.LBB7_4-.Lpost_getpc9)&4294967295
-; GFX11-NEXT:    s_addc_u32 s1, s1, (.LBB7_4-.Lpost_getpc9)>>32
+; GFX11-NEXT:    s_add_u32 s0, s0, (.LBB7_4-.Lpost_getpc8)&4294967295
+; GFX11-NEXT:    s_addc_u32 s1, s1, (.LBB7_4-.Lpost_getpc8)>>32
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
 ; GFX11-NEXT:    s_setpc_b64 s[0:1]
 ; GFX11-NEXT:  .LBB7_3: ; %bb2
@@ -1067,8 +1039,6 @@ define amdgpu_kernel void @uniform_inside_divergent(ptr addrspace(1) %out, i32 %
 ; GFX11-NEXT:  .LBB8_3: ; %endif
 ; GFX11-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX11-NEXT:    s_sleep 5
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: uniform_inside_divergent:
@@ -1091,8 +1061,6 @@ define amdgpu_kernel void @uniform_inside_divergent(ptr addrspace(1) %out, i32 %
 ; GFX12-NEXT:  .LBB8_3: ; %endif
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX12-NEXT:    s_sleep 5
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1186,10 +1154,10 @@ define amdgpu_kernel void @analyze_mask_branch() #0 {
 ; GFX11-NEXT:    s_cbranch_execnz .LBB9_3
 ; GFX11-NEXT:  ; %bb.6: ; %Flow1
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
-; GFX11-NEXT:  .Lpost_getpc10:
+; GFX11-NEXT:  .Lpost_getpc9:
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
-; GFX11-NEXT:    s_add_u32 s0, s0, (.LBB9_5-.Lpost_getpc10)&4294967295
-; GFX11-NEXT:    s_addc_u32 s1, s1, (.LBB9_5-.Lpost_getpc10)>>32
+; GFX11-NEXT:    s_add_u32 s0, s0, (.LBB9_5-.Lpost_getpc9)&4294967295
+; GFX11-NEXT:    s_addc_u32 s1, s1, (.LBB9_5-.Lpost_getpc9)>>32
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
 ; GFX11-NEXT:    s_setpc_b64 s[0:1]
 ; GFX11-NEXT:  .LBB9_3: ; %loop.preheader
@@ -1211,10 +1179,10 @@ define amdgpu_kernel void @analyze_mask_branch() #0 {
 ; GFX11-NEXT:  ; %bb.8: ; %loop
 ; GFX11-NEXT:    ; in Loop: Header=BB9_4 Depth=1
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
-; GFX11-NEXT:  .Lpost_getpc11:
+; GFX11-NEXT:  .Lpost_getpc10:
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
-; GFX11-NEXT:    s_add_u32 s0, s0, (.LBB9_4-.Lpost_getpc11)&4294967295
-; GFX11-NEXT:    s_addc_u32 s1, s1, (.LBB9_4-.Lpost_getpc11)>>32
+; GFX11-NEXT:    s_add_u32 s0, s0, (.LBB9_4-.Lpost_getpc10)&4294967295
+; GFX11-NEXT:    s_addc_u32 s1, s1, (.LBB9_4-.Lpost_getpc10)>>32
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
 ; GFX11-NEXT:    s_setpc_b64 s[0:1]
 ; GFX11-NEXT:  .LBB9_5: ; %UnifiedReturnBlock
@@ -1374,10 +1342,10 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32
 ; GFX11-NEXT:    s_cbranch_scc1 .LBB10_1
 ; GFX11-NEXT:  ; %bb.8: ; %bb
 ; GFX11-NEXT:    s_getpc_b64 s[8:9]
-; GFX11-NEXT:  .Lpost_getpc12:
+; GFX11-NEXT:  .Lpost_getpc11:
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
-; GFX11-NEXT:    s_add_u32 s8, s8, (.LBB10_2-.Lpost_getpc12)&4294967295
-; GFX11-NEXT:    s_addc_u32 s9, s9, (.LBB10_2-.Lpost_getpc12)>>32
+; GFX11-NEXT:    s_add_u32 s8, s8, (.LBB10_2-.Lpost_getpc11)&4294967295
+; GFX11-NEXT:    s_addc_u32 s9, s9, (.LBB10_2-.Lpost_getpc11)>>32
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
 ; GFX11-NEXT:    s_setpc_b64 s[8:9]
 ; GFX11-NEXT:  .LBB10_1: ; %bb13
@@ -1426,8 +1394,6 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32
 ; GFX11-NEXT:    s_add_u32 s0, s2, s0
 ; GFX11-NEXT:    s_addc_u32 s1, s3, s1
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: long_branch_hang:
@@ -1537,8 +1503,6 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[2:3], s[0:1]
 ; GFX12-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %tmp = icmp slt i32 %arg2, 9
diff --git a/llvm/test/CodeGen/AMDGPU/bswap.ll b/llvm/test/CodeGen/AMDGPU/bswap.ll
index 321a7ceb826f6e..3c48a0f0dcabe3 100644
--- a/llvm/test/CodeGen/AMDGPU/bswap.ll
+++ b/llvm/test/CodeGen/AMDGPU/bswap.ll
@@ -57,8 +57,6 @@ define amdgpu_kernel void @test_bswap_i32(ptr addrspace(1) %out, ptr addrspace(1
 ; GFX11-NEXT:    v_perm_b32 v0, 0, s2, 0x10203
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %val = load i32, ptr addrspace(1) %in, align 4
   %bswap = call i32 @llvm.bswap.i32(i32 %val) nounwind readnone
@@ -112,8 +110,6 @@ define amdgpu_kernel void @test_bswap_v2i32(ptr addrspace(1) %out, ptr addrspace
 ; GFX11-NEXT:    v_perm_b32 v1, 0, s5, 0x10203
 ; GFX11-NEXT:    v_perm_b32 v0, 0, s4, 0x10203
 ; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %val = load <2 x i32>, ptr addrspace(1) %in, align 8
   %bswap = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %val) nounwind readnone
@@ -177,8 +173,6 @@ define amdgpu_kernel void @test_bswap_v4i32(ptr addrspace(1) %out, ptr addrspace
 ; GFX11-NEXT:    v_perm_b32 v1, 0, s5, 0x10203
 ; GFX11-NEXT:    v_perm_b32 v0, 0, s4, 0x10203
 ; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %val = load <4 x i32>, ptr addrspace(1) %in, align 16
   %bswap = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %val) nounwind readnone
@@ -266,8 +260,6 @@ define amdgpu_kernel void @test_bswap_v8i32(ptr addrspace(1) %out, ptr addrspace
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    buffer_store_b128 v[4:7], off, s[8:11], 0 offset:16
 ; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[8:11], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %val = load <8 x i32>, ptr addrspace(1) %in, align 32
   %bswap = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %val) nounwind readnone
@@ -321,8 +313,6 @@ define amdgpu_kernel void @test_bswap_i64(ptr addrspace(1) %out, ptr addrspace(1
 ; GFX11-NEXT:    v_perm_b32 v1, 0, s4, 0x10203
 ; GFX11-NEXT:    v_perm_b32 v0, 0, s5, 0x10203
 ; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %val = load i64, ptr addrspace(1) %in, align 8
   %bswap = call i64 @llvm.bswap.i64(i64 %val) nounwind readnone
@@ -386,8 +376,6 @@ define amdgpu_kernel void @test_bswap_v2i64(ptr addrspace(1) %out, ptr addrspace
 ; GFX11-NEXT:    v_perm_b32 v1, 0, s4, 0x10203
 ; GFX11-NEXT:    v_perm_b32 v0, 0, s5, 0x10203
 ; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %val = load <2 x i64>, ptr addrspace(1) %in, align 16
   %bswap = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %val) nounwind readnone
@@ -475,8 +463,6 @@ define amdgpu_kernel void @test_bswap_v4i64(ptr addrspace(1) %out, ptr addrspace
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    buffer_store_b128 v[4:7], off, s[8:11], 0 offset:16
 ; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[8:11], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %val = load <4 x i64>, ptr addrspace(1) %in, align 32
   %bswap = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %val) nounwind readnone
diff --git a/llvm/test/CodeGen/AMDGPU/build_vector.ll b/llvm/test/CodeGen/AMDGPU/build_vector.ll
index 13c4ff8b2ff302..dc9ce68a4a6836 100644
--- a/llvm/test/CodeGen/AMDGPU/build_vector.ll
+++ b/llvm/test/CodeGen/AMDGPU/build_vector.ll
@@ -46,8 +46,6 @@ define amdgpu_kernel void @build_vector2 (ptr addrspace(1) %out) {
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 6
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX940-LABEL: build_vector2:
@@ -113,8 +111,6 @@ define amdgpu_kernel void @build_vector4 (ptr addrspace(1) %out) {
 ; GFX11-NEXT:    v_mov_b32_e32 v3, 8
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX940-LABEL: build_vector4:
@@ -170,8 +166,6 @@ define amdgpu_kernel void @build_vector_v2i16 (ptr addrspace(1) %out) {
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0x60005
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX940-LABEL: build_vector_v2i16:
@@ -236,8 +230,6 @@ define amdgpu_kernel void @build_vector_v2i16_trunc (ptr addrspace(1) %out, i32
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX940-LABEL: build_vector_v2i16_trunc:
@@ -310,8 +302,6 @@ define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out,
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX940-LABEL: build_v2i32_from_v4i16_shuffle:
diff --git a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
index 29770738f83d57..b334047d325555 100644
--- a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
+++ b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
@@ -32,8 +32,6 @@ define spir_kernel void @kernel(ptr addrspace(1) %out) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v0, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   store i32 0, ptr addrspace(1) %out
@@ -487,8 +485,6 @@ define amdgpu_ps void @ps_mesa_v2i16(<2 x i16> %arg0) {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0]
 ; GFX11-NEXT:    global_store_b32 v[0:1], v0, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %add = add <2 x i16> %arg0, <i16 1, i16 1>
   store <2 x i16> %add, ptr addrspace(1) undef
@@ -524,8 +520,6 @@ define amdgpu_ps void @ps_mesa_inreg_v2i16(<2 x i16> inreg %arg0) {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_pk_add_u16 v0, s0, 1 op_sel_hi:[1,0]
 ; GFX11-NEXT:    global_store_b32 v[0:1], v0, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %add = add <2 x i16> %arg0, <i16 1, i16 1>
   store <2 x i16> %add, ptr addrspace(1) undef
@@ -648,8 +642,6 @@ define amdgpu_ps void @ps_mesa_inreg_v3i32(<3 x i32> inreg %arg0) {
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-NEXT:    global_store_b96 v[0:1], v[0:2], off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %add = add <3 x i32> %arg0, <i32 1, i32 2, i32 3>
   store <3 x i32> %add, ptr addrspace(1) undef
@@ -682,8 +674,6 @@ define amdgpu_ps void @ps_mesa_inreg_v3f32(<3 x float> inreg %arg0) {
 ; GFX11-NEXT:    v_add_f32_e64 v1, s1, 2.0
 ; GFX11-NEXT:    v_add_f32_e64 v0, s0, 1.0
 ; GFX11-NEXT:    global_store_b96 v[0:1], v[0:2], off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %add = fadd <3 x float> %arg0, <float 1.0, float 2.0, float 4.0>
   store <3 x float> %add, ptr addrspace(1) undef
@@ -739,8 +729,6 @@ define amdgpu_ps void @ps_mesa_inreg_v5i32(<5 x i32> inreg %arg0) {
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b32 v[0:1], v4, off
 ; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %add = add <5 x i32> %arg0, <i32 1, i32 2, i32 3, i32 4, i32 5>
   store <5 x i32> %add, ptr addrspace(1) undef
@@ -782,8 +770,6 @@ define amdgpu_ps void @ps_mesa_inreg_v5f32(<5 x float> inreg %arg0) {
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b32 v[0:1], v4, off
 ; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %add = fadd <5 x float> %arg0, <float 1.0, float 2.0, float 4.0, float -1.0, float 0.5>
   store <5 x float> %add, ptr addrspace(1) undef
@@ -816,8 +802,6 @@ define amdgpu_ps void @ps_mesa_v3i32(<3 x i32> %arg0) {
 ; GFX11-NEXT:    v_add_nc_u32_e32 v1, 2, v1
 ; GFX11-NEXT:    v_add_nc_u32_e32 v0, 1, v0
 ; GFX11-NEXT:    global_store_b96 v[0:1], v[0:2], off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %add = add <3 x i32> %arg0, <i32 1, i32 2, i32 3>
   store <3 x i32> %add, ptr addrspace(1) undef
@@ -849,8 +833,6 @@ define amdgpu_ps void @ps_mesa_v3f32(<3 x float> %arg0) {
 ; GFX11-NEXT:    v_dual_add_f32 v2, 4.0, v2 :: v_dual_add_f32 v1, 2.0, v1
 ; GFX11-NEXT:    v_add_f32_e32 v0, 1.0, v0
 ; GFX11-NEXT:    global_store_b96 v[0:1], v[0:2], off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %add = fadd <3 x float> %arg0, <float 1.0, float 2.0, float 4.0>
   store <3 x float> %add, ptr addrspace(1) undef
@@ -892,8 +874,6 @@ define amdgpu_ps void @ps_mesa_v5i32(<5 x i32> %arg0) {
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b32 v[0:1], v4, off
 ; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %add = add <5 x i32> %arg0, <i32 1, i32 2, i32 3, i32 4, i32 5>
   store <5 x i32> %add, ptr addrspace(1) undef
@@ -933,8 +913,6 @@ define amdgpu_ps void @ps_mesa_v5f32(<5 x float> %arg0) {
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b32 v[0:1], v4, off
 ; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %add = fadd <5 x float> %arg0, <float 1.0, float 2.0, float 4.0, float -1.0, float 0.5>
   store <5 x float> %add, ptr addrspace(1) undef
@@ -960,8 +938,6 @@ define amdgpu_ps void @ps_mesa_i16(i16 %arg0) {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_add_nc_u16 v0, v0, v0
 ; GFX11-NEXT:    global_store_b16 v[0:1], v0, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %add = add i16 %arg0, %arg0
   store i16 %add, ptr addrspace(1) undef
@@ -993,8 +969,6 @@ define amdgpu_ps void @ps_mesa_inreg_i16(i16 inreg %arg0) {
 ; GFX11-NEXT:    s_add_i32 s0, s0, s0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-NEXT:    global_store_b16 v[0:1], v0, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %add = add i16 %arg0, %arg0
   store i16 %add, ptr addrspace(1) undef
@@ -1038,8 +1012,6 @@ define amdgpu_kernel void @amd_kernel_i8(i8 %arg0) {
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %add = add i8 %arg0, %arg0
@@ -1096,8 +1068,6 @@ define amdgpu_kernel void @amd_kernel_v2i8(<2 x i8> %arg0) {
 ; GFX11-NEXT:    s_or_b32 s0, s0, s1
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
 ; GFX11-NEXT:    global_store_b16 v[0:1], v2, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %add = add <2 x i8> %arg0, %arg0
@@ -1184,8 +1154,6 @@ define amdgpu_kernel void @amd_kernel_v4i8(<4 x i8> %arg0) {
 ; GFX11-NEXT:    s_or_b32 s0, s0, s1
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
 ; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %add = add <4 x i8> %arg0, %arg0
@@ -1259,8 +1227,6 @@ define amdgpu_kernel void @amd_kernel_v3i8(<3 x i8> %arg0) {
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b8 v[0:1], v4, off
 ; GFX11-NEXT:    global_store_b16 v[2:3], v5, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %add = add <3 x i8> %arg0, %arg0
@@ -1362,8 +1328,6 @@ define amdgpu_kernel void @amd_kernel_v5i8(<5 x i8> %arg0) {
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b8 v[0:1], v4, off
 ; GFX11-NEXT:    global_store_b32 v[2:3], v5, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %add = add <5 x i8> %arg0, %arg0
@@ -1500,8 +1464,6 @@ define amdgpu_kernel void @amd_kernel_v8i8(<8 x i8> %arg0) {
 ; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, 0
 ; GFX11-NEXT:    global_store_b64 v[2:3], v[0:1], off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %add = add <8 x i8> %arg0, %arg0
@@ -1738,8 +1700,6 @@ define amdgpu_kernel void @amd_kernel_v16i8(<16 x i8> %arg0) {
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, 0
 ; GFX11-NEXT:    global_store_b128 v[4:5], v[0:3], off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %add = add <16 x i8> %arg0, %arg0
@@ -2187,8 +2147,6 @@ define amdgpu_kernel void @amd_kernel_v32i8(<32 x i8> %arg0) {
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b128 v[8:9], v[0:3], off
 ; GFX11-NEXT:    global_store_b128 v[10:11], v[4:7], off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %add = add <32 x i8> %arg0, %arg0
@@ -2215,8 +2173,6 @@ define amdgpu_cs void @amdgpu_cs_i1(i1 %arg0) {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   store i1 %arg0, ptr addrspace(1) undef
   ret void
@@ -2305,8 +2261,6 @@ define amdgpu_cs void @amdgpu_cs_v8i1(<8 x i1> %arg0) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   store <8 x i1> %arg0, ptr addrspace(1) undef
   ret void
@@ -2467,8 +2421,6 @@ define amdgpu_cs void @amdgpu_cs_v16i1(<16 x i1> %arg0) {
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX11-NEXT:    global_store_b16 v[0:1], v0, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   store <16 x i1> %arg0, ptr addrspace(1) undef
   ret void
@@ -2768,8 +2720,6 @@ define amdgpu_cs void @amdgpu_cs_v32i1(<32 x i1> %arg0) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX11-NEXT:    global_store_b32 v[0:1], v0, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   store <32 x i1> %arg0, ptr addrspace(1) undef
   ret void
@@ -2798,8 +2748,6 @@ define amdgpu_cs void @amdgpu_cs_inreg_i1(i1 inreg %arg0) {
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   store i1 %arg0, ptr addrspace(1) undef
   ret void
@@ -2888,8 +2836,6 @@ define amdgpu_cs void @amdgpu_cs_inreg_v8i1(<8 x i1> inreg %arg0) {
 ; GFX11-NEXT:    s_or_b32 s0, s0, s1
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   store <8 x i1> %arg0, ptr addrspace(1) undef
   ret void
@@ -3050,8 +2996,6 @@ define amdgpu_cs void @amdgpu_cs_inreg_v16i1(<16 x i1> inreg %arg0) {
 ; GFX11-NEXT:    s_or_b32 s0, s0, s1
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-NEXT:    global_store_b16 v[0:1], v0, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   store <16 x i1> %arg0, ptr addrspace(1) undef
   ret void
@@ -3356,8 +3300,6 @@ define amdgpu_cs void @amdgpu_cs_inreg_v32i1(<32 x i1> inreg %arg0) {
 ; GFX11-NEXT:    s_or_b32 s0, s0, s1
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-NEXT:    global_store_b32 v[0:1], v0, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   store <32 x i1> %arg0, ptr addrspace(1) undef
   ret void
@@ -3382,8 +3324,6 @@ define amdgpu_cs void @amdgpu_cs_i1_sext(i1 signext %arg0) {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   store i1 %arg0, ptr addrspace(1) undef
   ret void
@@ -3405,8 +3345,6 @@ define amdgpu_cs void @amdgpu_cs_i1_zext(i1 zeroext %arg0) {
 ; GFX11-LABEL: amdgpu_cs_i1_zext:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   store i1 %arg0, ptr addrspace(1) undef
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
index 0cc10512af5cd6..8352376a9c1371 100644
--- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
+++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
@@ -112,8 +112,6 @@ define amdgpu_kernel void @sadd64rr(ptr addrspace(1) %out, i64 %a, i64 %b) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %add = add i64 %a, %b
@@ -212,8 +210,6 @@ define amdgpu_kernel void @sadd64ri(ptr addrspace(1) %out, i64 %a) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %add = add i64 20015998343286, %a
@@ -304,8 +300,6 @@ define amdgpu_kernel void @vadd64rr(ptr addrspace(1) %out, i64 %a) {
 ; GFX11-NEXT:    v_add_co_u32 v0, s2, s2, v0
 ; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, s3, 0, s2
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -397,8 +391,6 @@ define amdgpu_kernel void @vadd64ri(ptr addrspace(1) %out) {
 ; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, 0x1234, s2
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -494,8 +486,6 @@ define amdgpu_kernel void @suaddo32(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
   %val = extractvalue { i32, i1 } %uadd, 0
@@ -616,8 +606,6 @@ define amdgpu_kernel void @uaddo32_vcc_user(ptr addrspace(1) %out, ptr addrspace
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    global_store_b8 v0, v2, s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
   %val = extractvalue { i32, i1 } %uadd, 0
@@ -753,8 +741,6 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-NEXT:    global_store_b8 v2, v3, s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
   %val = extractvalue { i64, i1 } %uadd, 0
@@ -888,8 +874,6 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-NEXT:    global_store_b8 v2, v3, s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -1003,8 +987,6 @@ define amdgpu_kernel void @ssub64rr(ptr addrspace(1) %out, i64 %a, i64 %b) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %sub = sub i64 %a, %b
@@ -1103,8 +1085,6 @@ define amdgpu_kernel void @ssub64ri(ptr addrspace(1) %out, i64 %a) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %sub = sub i64 20015998343286, %a
@@ -1195,8 +1175,6 @@ define amdgpu_kernel void @vsub64rr(ptr addrspace(1) %out, i64 %a) {
 ; GFX11-NEXT:    v_sub_co_u32 v0, s2, s2, v0
 ; GFX11-NEXT:    v_sub_co_ci_u32_e64 v1, null, s3, 0, s2
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1288,8 +1266,6 @@ define amdgpu_kernel void @vsub64ri(ptr addrspace(1) %out) {
 ; GFX11-NEXT:    v_sub_co_ci_u32_e64 v1, null, 0x1234, 0, s2
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1386,8 +1362,6 @@ define amdgpu_kernel void @susubo32(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
   %val = extractvalue { i32, i1 } %usub, 0
@@ -1508,8 +1482,6 @@ define amdgpu_kernel void @usubo32_vcc_user(ptr addrspace(1) %out, ptr addrspace
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    global_store_b8 v0, v2, s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
   %val = extractvalue { i32, i1 } %usub, 0
@@ -1645,8 +1617,6 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-NEXT:    global_store_b8 v2, v3, s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
   %val = extractvalue { i64, i1 } %usub, 0
@@ -1780,8 +1750,6 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-NEXT:    global_store_b8 v2, v3, s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -2940,8 +2908,6 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GFX11-NEXT:  .LBB16_3:
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ; GFX11-NEXT:  .LBB16_4:
 ; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1
diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
index 3fc5d0d4b279eb..78f61ad906ce3c 100644
--- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
@@ -581,8 +581,6 @@ define amdgpu_kernel void @vload2_private(ptr addrspace(1) nocapture readonly %i
 ; GFX11-NEXT:    scratch_load_d16_hi_b16 v1, off, off offset:4
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %loc = alloca [3 x i16], align 2, addrspace(5)
diff --git a/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll b/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll
index bafbc9486a1c21..b17e1a08074117 100644
--- a/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll
+++ b/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll
@@ -60,8 +60,6 @@ define amdgpu_kernel void @v_clamp_add_src_f32(ptr addrspace(1) %out, ptr addrsp
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_add_f32_e64 v1, v1, 1.0 clamp
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -143,8 +141,6 @@ define amdgpu_kernel void @v_clamp_multi_use_src_f32(ptr addrspace(1) %out, ptr
 ; GFX11-NEXT:    global_store_b32 v0, v2, s[0:1]
 ; GFX11-NEXT:    global_store_b32 v[0:1], v1, off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -214,8 +210,6 @@ define amdgpu_kernel void @v_clamp_dbg_use_src_f32(ptr addrspace(1) %out, ptr ad
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_add_f32_e64 v1, v1, 1.0 clamp
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -289,8 +283,6 @@ define amdgpu_kernel void @v_clamp_add_neg_src_f32(ptr addrspace(1) %out, ptr ad
 ; GFX11-NEXT:    v_floor_f32_e32 v1, v1
 ; GFX11-NEXT:    v_max_f32_e64 v1, -v1, -v1 clamp
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -364,8 +356,6 @@ define amdgpu_kernel void @v_non_clamp_max_f32(ptr addrspace(1) %out, ptr addrsp
 ; GFX11-NEXT:    v_add_f32_e32 v1, 1.0, v1
 ; GFX11-NEXT:    v_max_f32_e32 v1, 0, v1
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -433,8 +423,6 @@ define amdgpu_kernel void @v_clamp_add_src_f32_denormals(ptr addrspace(1) %out,
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_add_f32_e64 v1, v1, 1.0 clamp
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -505,8 +493,6 @@ define amdgpu_kernel void @v_clamp_add_src_f16_denorm(ptr addrspace(1) %out, ptr
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_add_f16_e64 v1, v1, 1.0 clamp
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr half, ptr addrspace(1) %aptr, i32 %tid
@@ -577,8 +563,6 @@ define amdgpu_kernel void @v_clamp_add_src_f16_no_denormals(ptr addrspace(1) %ou
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_add_f16_e64 v1, v1, 1.0 clamp
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr half, ptr addrspace(1) %aptr, i32 %tid
@@ -651,8 +635,6 @@ define amdgpu_kernel void @v_clamp_add_src_v2f32(ptr addrspace(1) %out, ptr addr
 ; GFX11-NEXT:    v_add_f32_e64 v0, v0, 1.0 clamp
 ; GFX11-NEXT:    v_add_f32_e64 v1, v1, 1.0 clamp
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr <2 x float>, ptr addrspace(1) %aptr, i32 %tid
@@ -721,8 +703,6 @@ define amdgpu_kernel void @v_clamp_add_src_f64(ptr addrspace(1) %out, ptr addrsp
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0 clamp
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr double, ptr addrspace(1) %aptr, i32 %tid
@@ -802,8 +782,6 @@ define amdgpu_kernel void @v_clamp_mac_to_mad(ptr addrspace(1) %out, ptr addrspa
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_add_f32_e32 v1, v2, v1
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -886,8 +864,6 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm(ptr addrspace(1) %out, p
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0] clamp
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid
@@ -967,8 +943,6 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_no_denormals(ptr addrspace(1) %
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0] clamp
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid
@@ -1060,8 +1034,6 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg(ptr addrspace(1) %ou
 ; GFX11-NEXT:    v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0]
 ; GFX11-NEXT:    v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid
@@ -1146,8 +1118,6 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg_lo(ptr addrspace(1)
 ; GFX11-NEXT:    v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0]
 ; GFX11-NEXT:    v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] clamp
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid
@@ -1234,8 +1204,6 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg_hi(ptr addrspace(1)
 ; GFX11-NEXT:    v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0]
 ; GFX11-NEXT:    v_pk_max_f16 v1, v1, v1 neg_hi:[1,1] clamp
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid
@@ -1320,8 +1288,6 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_shuf(ptr addrspace(1) %o
 ; GFX11-NEXT:    v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0]
 ; GFX11-NEXT:    v_pk_max_f16 v1, v1, v1 op_sel:[1,1] op_sel_hi:[0,0] clamp
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid
@@ -1404,8 +1370,6 @@ define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f32_src(ptr addrspace(1) %ou
 ; GFX11-NEXT:    v_add_f32_e32 v1, 1.0, v1
 ; GFX11-NEXT:    v_pk_max_f16 v1, v1, v1 clamp
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid
@@ -1491,8 +1455,6 @@ define amdgpu_kernel void @v_no_clamp_add_packed_src_f32(ptr addrspace(1) %out,
 ; GFX11-NEXT:    v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0]
 ; GFX11-NEXT:    v_max_f32_e64 v1, v1, v1 clamp
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid
@@ -1579,8 +1541,6 @@ define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f16_src(ptr addrspace(1) %ou
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX11-NEXT:    v_pk_max_f16 v1, v1, v1 clamp
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr half, ptr addrspace(1) %aptr, i32 %tid
diff --git a/llvm/test/CodeGen/AMDGPU/clamp.ll b/llvm/test/CodeGen/AMDGPU/clamp.ll
index 94482d7f0d809b..311feafe3f43f7 100644
--- a/llvm/test/CodeGen/AMDGPU/clamp.ll
+++ b/llvm/test/CodeGen/AMDGPU/clamp.ll
@@ -61,8 +61,6 @@ define amdgpu_kernel void @v_clamp_f32(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_max_f32_e64 v1, v1, v1 clamp
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_clamp_f32:
@@ -76,8 +74,6 @@ define amdgpu_kernel void @v_clamp_f32(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_max_num_f32_e64 v1, v1, v1 clamp
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -146,8 +142,6 @@ define amdgpu_kernel void @v_clamp_neg_f32(ptr addrspace(1) %out, ptr addrspace(
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_max_f32_e64 v1, -v1, -v1 clamp
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_clamp_neg_f32:
@@ -161,8 +155,6 @@ define amdgpu_kernel void @v_clamp_neg_f32(ptr addrspace(1) %out, ptr addrspace(
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_max_num_f32_e64 v1, -v1, -v1 clamp
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -232,8 +224,6 @@ define amdgpu_kernel void @v_clamp_negabs_f32(ptr addrspace(1) %out, ptr addrspa
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_max_f32_e64 v1, -|v1|, -|v1| clamp
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_clamp_negabs_f32:
@@ -247,8 +237,6 @@ define amdgpu_kernel void @v_clamp_negabs_f32(ptr addrspace(1) %out, ptr addrspa
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_max_num_f32_e64 v1, -|v1|, -|v1| clamp
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -327,8 +315,6 @@ define amdgpu_kernel void @v_clamp_negzero_f32(ptr addrspace(1) %out, ptr addrsp
 ; GFX11-NEXT:    v_add_f32_e32 v1, 0.5, v1
 ; GFX11-NEXT:    v_maxmin_f32 v1, v1, 0x80000000, 1.0
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_clamp_negzero_f32:
@@ -343,8 +329,6 @@ define amdgpu_kernel void @v_clamp_negzero_f32(ptr addrspace(1) %out, ptr addrsp
 ; GFX12-NEXT:    v_add_f32_e32 v1, 0.5, v1
 ; GFX12-NEXT:    v_maxmin_num_f32 v1, v1, 0x80000000, 1.0
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -423,8 +407,6 @@ define amdgpu_kernel void @v_clamp_negzero_maybe_snan_f32(ptr addrspace(1) %out,
 ; GFX11-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX11-NEXT:    v_maxmin_f32 v1, v1, 0x80000000, 1.0
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_clamp_negzero_maybe_snan_f32:
@@ -439,8 +421,6 @@ define amdgpu_kernel void @v_clamp_negzero_maybe_snan_f32(ptr addrspace(1) %out,
 ; GFX12-NEXT:    v_max_num_f32_e32 v1, v1, v1
 ; GFX12-NEXT:    v_maxmin_num_f32 v1, v1, 0x80000000, 1.0
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -527,8 +507,6 @@ define amdgpu_kernel void @v_clamp_multi_use_max_f32(ptr addrspace(1) %out, ptr
 ; GFX11-NEXT:    global_store_b32 v0, v2, s[0:1]
 ; GFX11-NEXT:    global_store_b32 v[0:1], v1, off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_clamp_multi_use_max_f32:
@@ -548,8 +526,6 @@ define amdgpu_kernel void @v_clamp_multi_use_max_f32(ptr addrspace(1) %out, ptr
 ; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    global_store_b32 v[0:1], v1, off scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -620,8 +596,6 @@ define amdgpu_kernel void @v_clamp_f16(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_max_f16_e64 v1, v1, v1 clamp
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_clamp_f16:
@@ -635,8 +609,6 @@ define amdgpu_kernel void @v_clamp_f16(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_max_num_f16_e64 v1, v1, v1 clamp
 ; GFX12-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr half, ptr addrspace(1) %aptr, i32 %tid
@@ -706,8 +678,6 @@ define amdgpu_kernel void @v_clamp_neg_f16(ptr addrspace(1) %out, ptr addrspace(
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_max_f16_e64 v1, -v1, -v1 clamp
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_clamp_neg_f16:
@@ -721,8 +691,6 @@ define amdgpu_kernel void @v_clamp_neg_f16(ptr addrspace(1) %out, ptr addrspace(
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_max_num_f16_e64 v1, -v1, -v1 clamp
 ; GFX12-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr half, ptr addrspace(1) %aptr, i32 %tid
@@ -793,8 +761,6 @@ define amdgpu_kernel void @v_clamp_negabs_f16(ptr addrspace(1) %out, ptr addrspa
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_max_f16_e64 v1, -|v1|, -|v1| clamp
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_clamp_negabs_f16:
@@ -808,8 +774,6 @@ define amdgpu_kernel void @v_clamp_negabs_f16(ptr addrspace(1) %out, ptr addrspa
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_max_num_f16_e64 v1, -|v1|, -|v1| clamp
 ; GFX12-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr half, ptr addrspace(1) %aptr, i32 %tid
@@ -881,8 +845,6 @@ define amdgpu_kernel void @v_clamp_f64(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1] clamp
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_clamp_f64:
@@ -896,8 +858,6 @@ define amdgpu_kernel void @v_clamp_f64(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_max_num_f64_e64 v[0:1], v[0:1], v[0:1] clamp
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr double, ptr addrspace(1) %aptr, i32 %tid
@@ -966,8 +926,6 @@ define amdgpu_kernel void @v_clamp_neg_f64(ptr addrspace(1) %out, ptr addrspace(
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_max_f64 v[0:1], -v[0:1], -v[0:1] clamp
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_clamp_neg_f64:
@@ -981,8 +939,6 @@ define amdgpu_kernel void @v_clamp_neg_f64(ptr addrspace(1) %out, ptr addrspace(
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_max_num_f64_e64 v[0:1], -v[0:1], -v[0:1] clamp
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr double, ptr addrspace(1) %aptr, i32 %tid
@@ -1052,8 +1008,6 @@ define amdgpu_kernel void @v_clamp_negabs_f64(ptr addrspace(1) %out, ptr addrspa
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_max_f64 v[0:1], -|v[0:1]|, -|v[0:1]| clamp
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_clamp_negabs_f64:
@@ -1067,8 +1021,6 @@ define amdgpu_kernel void @v_clamp_negabs_f64(ptr addrspace(1) %out, ptr addrspa
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_max_num_f64_e64 v[0:1], -|v[0:1]|, -|v[0:1]| clamp
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr double, ptr addrspace(1) %aptr, i32 %tid
@@ -1143,8 +1095,6 @@ define amdgpu_kernel void @v_clamp_med3_aby_negzero_f32(ptr addrspace(1) %out, p
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_med3_f32 v1, 0x80000000, 1.0, v1
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_clamp_med3_aby_negzero_f32:
@@ -1158,8 +1108,6 @@ define amdgpu_kernel void @v_clamp_med3_aby_negzero_f32(ptr addrspace(1) %out, p
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_med3_num_f32 v1, 0x80000000, 1.0, v1
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -1226,8 +1174,6 @@ define amdgpu_kernel void @v_clamp_med3_aby_f32(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_max_f32_e64 v1, v1, v1 clamp
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_clamp_med3_aby_f32:
@@ -1241,8 +1187,6 @@ define amdgpu_kernel void @v_clamp_med3_aby_f32(ptr addrspace(1) %out, ptr addrs
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_max_num_f32_e64 v1, v1, v1 clamp
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -1309,8 +1253,6 @@ define amdgpu_kernel void @v_clamp_med3_bay_f32(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_max_f32_e64 v1, v1, v1 clamp
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_clamp_med3_bay_f32:
@@ -1324,8 +1266,6 @@ define amdgpu_kernel void @v_clamp_med3_bay_f32(ptr addrspace(1) %out, ptr addrs
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_max_num_f32_e64 v1, v1, v1 clamp
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -1392,8 +1332,6 @@ define amdgpu_kernel void @v_clamp_med3_yab_f32(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_max_f32_e64 v1, v1, v1 clamp
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_clamp_med3_yab_f32:
@@ -1407,8 +1345,6 @@ define amdgpu_kernel void @v_clamp_med3_yab_f32(ptr addrspace(1) %out, ptr addrs
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_max_num_f32_e64 v1, v1, v1 clamp
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -1475,8 +1411,6 @@ define amdgpu_kernel void @v_clamp_med3_yba_f32(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_max_f32_e64 v1, v1, v1 clamp
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_clamp_med3_yba_f32:
@@ -1490,8 +1424,6 @@ define amdgpu_kernel void @v_clamp_med3_yba_f32(ptr addrspace(1) %out, ptr addrs
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_max_num_f32_e64 v1, v1, v1 clamp
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -1558,8 +1490,6 @@ define amdgpu_kernel void @v_clamp_med3_ayb_f32(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_max_f32_e64 v1, v1, v1 clamp
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_clamp_med3_ayb_f32:
@@ -1573,8 +1503,6 @@ define amdgpu_kernel void @v_clamp_med3_ayb_f32(ptr addrspace(1) %out, ptr addrs
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_max_num_f32_e64 v1, v1, v1 clamp
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -1641,8 +1569,6 @@ define amdgpu_kernel void @v_clamp_med3_bya_f32(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_max_f32_e64 v1, v1, v1 clamp
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_clamp_med3_bya_f32:
@@ -1656,8 +1582,6 @@ define amdgpu_kernel void @v_clamp_med3_bya_f32(ptr addrspace(1) %out, ptr addrs
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_max_num_f32_e64 v1, v1, v1 clamp
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -1710,8 +1634,6 @@ define amdgpu_kernel void @v_clamp_constants_to_one_f32(ptr addrspace(1) %out) #
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_clamp_constants_to_one_f32:
@@ -1722,8 +1644,6 @@ define amdgpu_kernel void @v_clamp_constants_to_one_f32(ptr addrspace(1) %out) #
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
@@ -1773,8 +1693,6 @@ define amdgpu_kernel void @v_clamp_constants_to_zero_f32(ptr addrspace(1) %out)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_clamp_constants_to_zero_f32:
@@ -1785,8 +1703,6 @@ define amdgpu_kernel void @v_clamp_constants_to_zero_f32(ptr addrspace(1) %out)
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
@@ -1837,8 +1753,6 @@ define amdgpu_kernel void @v_clamp_constant_preserve_f32(ptr addrspace(1) %out)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_clamp_constant_preserve_f32:
@@ -1849,8 +1763,6 @@ define amdgpu_kernel void @v_clamp_constant_preserve_f32(ptr addrspace(1) %out)
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
@@ -1901,8 +1813,6 @@ define amdgpu_kernel void @v_clamp_constant_preserve_denorm_f32(ptr addrspace(1)
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0x7fffff :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_clamp_constant_preserve_denorm_f32:
@@ -1913,8 +1823,6 @@ define amdgpu_kernel void @v_clamp_constant_preserve_denorm_f32(ptr addrspace(1)
 ; GFX12-NEXT:    v_dual_mov_b32 v1, 0x7fffff :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
@@ -1964,8 +1872,6 @@ define amdgpu_kernel void @v_clamp_constant_qnan_f32(ptr addrspace(1) %out) #0 {
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_clamp_constant_qnan_f32:
@@ -1976,8 +1882,6 @@ define amdgpu_kernel void @v_clamp_constant_qnan_f32(ptr addrspace(1) %out) #0 {
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
@@ -2027,8 +1931,6 @@ define amdgpu_kernel void @v_clamp_constant_snan_f32(ptr addrspace(1) %out) #0 {
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_clamp_constant_snan_f32:
@@ -2039,8 +1941,6 @@ define amdgpu_kernel void @v_clamp_constant_snan_f32(ptr addrspace(1) %out) #0 {
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
@@ -2113,8 +2013,6 @@ define amdgpu_kernel void @v_clamp_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr
 ; GFX11-NEXT:    v_add_f32_e32 v1, 0.5, v1
 ; GFX11-NEXT:    v_med3_f32 v1, v1, 0, 1.0
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_clamp_f32_no_dx10_clamp:
@@ -2128,8 +2026,6 @@ define amdgpu_kernel void @v_clamp_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_add_f32_e64 v1, v1, 0.5 clamp
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -2199,8 +2095,6 @@ define amdgpu_kernel void @v_clamp_f32_snan_dx10clamp(ptr addrspace(1) %out, ptr
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_add_f32_e64 v1, v1, 0.5 clamp
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_clamp_f32_snan_dx10clamp:
@@ -2214,8 +2108,6 @@ define amdgpu_kernel void @v_clamp_f32_snan_dx10clamp(ptr addrspace(1) %out, ptr
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_add_f32_e64 v1, v1, 0.5 clamp
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -2289,8 +2181,6 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp(ptr addrspace(1) %out,
 ; GFX11-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX11-NEXT:    v_med3_f32 v1, v1, 0, 1.0
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_clamp_f32_snan_no_dx10clamp:
@@ -2304,8 +2194,6 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp(ptr addrspace(1) %out,
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_max_num_f32_e64 v1, v1, v1 clamp
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -2378,8 +2266,6 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp_nnan_src(ptr addrspace(
 ; GFX11-NEXT:    v_add_f32_e32 v1, 1.0, v1
 ; GFX11-NEXT:    v_med3_f32 v1, v1, 0, 1.0
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_clamp_f32_snan_no_dx10clamp_nnan_src:
@@ -2393,8 +2279,6 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp_nnan_src(ptr addrspace(
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_add_f32_e64 v1, v1, 1.0 clamp
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -2464,8 +2348,6 @@ define amdgpu_kernel void @v_clamp_med3_aby_f32_no_dx10_clamp(ptr addrspace(1) %
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_max_f32_e64 v1, v1, v1 clamp
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_clamp_med3_aby_f32_no_dx10_clamp:
@@ -2479,8 +2361,6 @@ define amdgpu_kernel void @v_clamp_med3_aby_f32_no_dx10_clamp(ptr addrspace(1) %
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_max_num_f32_e64 v1, v1, v1 clamp
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -2547,8 +2427,6 @@ define amdgpu_kernel void @v_clamp_med3_bay_f32_no_dx10_clamp(ptr addrspace(1) %
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_max_f32_e64 v1, v1, v1 clamp
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_clamp_med3_bay_f32_no_dx10_clamp:
@@ -2562,8 +2440,6 @@ define amdgpu_kernel void @v_clamp_med3_bay_f32_no_dx10_clamp(ptr addrspace(1) %
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_max_num_f32_e64 v1, v1, v1 clamp
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -2630,8 +2506,6 @@ define amdgpu_kernel void @v_clamp_med3_yab_f32_no_dx10_clamp(ptr addrspace(1) %
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_med3_f32 v1, v1, 0, 1.0
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_clamp_med3_yab_f32_no_dx10_clamp:
@@ -2645,8 +2519,6 @@ define amdgpu_kernel void @v_clamp_med3_yab_f32_no_dx10_clamp(ptr addrspace(1) %
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_max_num_f32_e64 v1, v1, v1 clamp
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -2713,8 +2585,6 @@ define amdgpu_kernel void @v_clamp_med3_yba_f32_no_dx10_clamp(ptr addrspace(1) %
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_med3_f32 v1, v1, 1.0, 0
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_clamp_med3_yba_f32_no_dx10_clamp:
@@ -2728,8 +2598,6 @@ define amdgpu_kernel void @v_clamp_med3_yba_f32_no_dx10_clamp(ptr addrspace(1) %
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_max_num_f32_e64 v1, v1, v1 clamp
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -2796,8 +2664,6 @@ define amdgpu_kernel void @v_clamp_med3_ayb_f32_no_dx10_clamp(ptr addrspace(1) %
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_med3_f32 v1, 0, v1, 1.0
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_clamp_med3_ayb_f32_no_dx10_clamp:
@@ -2811,8 +2677,6 @@ define amdgpu_kernel void @v_clamp_med3_ayb_f32_no_dx10_clamp(ptr addrspace(1) %
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_max_num_f32_e64 v1, v1, v1 clamp
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -2879,8 +2743,6 @@ define amdgpu_kernel void @v_clamp_med3_bya_f32_no_dx10_clamp(ptr addrspace(1) %
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_med3_f32 v1, 1.0, v1, 0
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_clamp_med3_bya_f32_no_dx10_clamp:
@@ -2894,8 +2756,6 @@ define amdgpu_kernel void @v_clamp_med3_bya_f32_no_dx10_clamp(ptr addrspace(1) %
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_max_num_f32_e64 v1, v1, v1 clamp
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -2948,8 +2808,6 @@ define amdgpu_kernel void @v_clamp_constant_qnan_f32_no_dx10_clamp(ptr addrspace
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0x7fc00000 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_clamp_constant_qnan_f32_no_dx10_clamp:
@@ -2960,8 +2818,6 @@ define amdgpu_kernel void @v_clamp_constant_qnan_f32_no_dx10_clamp(ptr addrspace
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
@@ -3012,8 +2868,6 @@ define amdgpu_kernel void @v_clamp_constant_snan_f32_no_dx10_clamp(ptr addrspace
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0x7f800001 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_clamp_constant_snan_f32_no_dx10_clamp:
@@ -3024,8 +2878,6 @@ define amdgpu_kernel void @v_clamp_constant_snan_f32_no_dx10_clamp(ptr addrspace
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
@@ -3098,8 +2950,6 @@ define amdgpu_kernel void @v_clamp_v2f16(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_max_f16 v1, v1, v1 clamp
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_clamp_v2f16:
@@ -3113,8 +2963,6 @@ define amdgpu_kernel void @v_clamp_v2f16(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_pk_max_num_f16 v1, v1, v1 clamp
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid
@@ -3200,8 +3048,6 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_elt(ptr addrspace(1) %out, ptr ad
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_max_f16 v1, v1, v1 clamp
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_clamp_v2f16_undef_elt:
@@ -3215,8 +3061,6 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_elt(ptr addrspace(1) %out, ptr ad
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_pk_max_num_f16 v1, v1, v1 clamp
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid
@@ -3302,8 +3146,6 @@ define amdgpu_kernel void @v_clamp_v2f16_not_zero(ptr addrspace(1) %out, ptr add
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0]
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_clamp_v2f16_not_zero:
@@ -3320,8 +3162,6 @@ define amdgpu_kernel void @v_clamp_v2f16_not_zero(ptr addrspace(1) %out, ptr add
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_pk_min_num_f16 v1, v1, 1.0 op_sel_hi:[1,0]
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid
@@ -3406,8 +3246,6 @@ define amdgpu_kernel void @v_clamp_v2f16_not_one(ptr addrspace(1) %out, ptr addr
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_pk_min_f16 v1, v1, 1.0 op_sel:[0,1] op_sel_hi:[1,0]
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_clamp_v2f16_not_one:
@@ -3424,8 +3262,6 @@ define amdgpu_kernel void @v_clamp_v2f16_not_one(ptr addrspace(1) %out, ptr addr
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_pk_min_num_f16 v1, v1, 1.0 op_sel:[0,1] op_sel_hi:[1,0]
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid
@@ -3503,8 +3339,6 @@ define amdgpu_kernel void @v_clamp_neg_v2f16(ptr addrspace(1) %out, ptr addrspac
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_clamp_neg_v2f16:
@@ -3518,8 +3352,6 @@ define amdgpu_kernel void @v_clamp_neg_v2f16(ptr addrspace(1) %out, ptr addrspac
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_pk_max_num_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid
@@ -3600,8 +3432,6 @@ define amdgpu_kernel void @v_clamp_negabs_v2f16(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0x7fff7fff, v1
 ; GFX11-NEXT:    v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_clamp_negabs_v2f16:
@@ -3616,8 +3446,6 @@ define amdgpu_kernel void @v_clamp_negabs_v2f16(ptr addrspace(1) %out, ptr addrs
 ; GFX12-NEXT:    v_and_b32_e32 v1, 0x7fff7fff, v1
 ; GFX12-NEXT:    v_pk_max_num_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid
@@ -3698,8 +3526,6 @@ define amdgpu_kernel void @v_clamp_neglo_v2f16(ptr addrspace(1) %out, ptr addrsp
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] clamp
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_clamp_neglo_v2f16:
@@ -3713,8 +3539,6 @@ define amdgpu_kernel void @v_clamp_neglo_v2f16(ptr addrspace(1) %out, ptr addrsp
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_pk_max_num_f16 v1, v1, v1 neg_lo:[1,1] clamp
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid
@@ -3794,8 +3618,6 @@ define amdgpu_kernel void @v_clamp_neghi_v2f16(ptr addrspace(1) %out, ptr addrsp
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_max_f16 v1, v1, v1 neg_hi:[1,1] clamp
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_clamp_neghi_v2f16:
@@ -3809,8 +3631,6 @@ define amdgpu_kernel void @v_clamp_neghi_v2f16(ptr addrspace(1) %out, ptr addrsp
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_pk_max_num_f16 v1, v1, v1 neg_hi:[1,1] clamp
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid
@@ -3890,8 +3710,6 @@ define amdgpu_kernel void @v_clamp_v2f16_shuffle(ptr addrspace(1) %out, ptr addr
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_max_f16 v1, v1, v1 op_sel:[1,1] op_sel_hi:[0,0] clamp
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_clamp_v2f16_shuffle:
@@ -3905,8 +3723,6 @@ define amdgpu_kernel void @v_clamp_v2f16_shuffle(ptr addrspace(1) %out, ptr addr
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_pk_max_num_f16 v1, v1, v1 op_sel:[1,1] op_sel_hi:[0,0] clamp
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid
@@ -3993,8 +3809,6 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts0(ptr addrspace(1) %out
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_max_f16 v1, v1, v1 clamp
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_clamp_v2f16_undef_limit_elts0:
@@ -4008,8 +3822,6 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts0(ptr addrspace(1) %out
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_pk_max_num_f16 v1, v1, v1 clamp
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid
@@ -4095,8 +3907,6 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts1(ptr addrspace(1) %out
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_max_f16 v1, v1, v1 clamp
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_clamp_v2f16_undef_limit_elts1:
@@ -4110,8 +3920,6 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts1(ptr addrspace(1) %out
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_pk_max_num_f16 v1, v1, v1 clamp
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr <2 x half>, ptr addrspace(1) %aptr, i32 %tid
@@ -4191,8 +3999,6 @@ define amdgpu_kernel void @v_clamp_diff_source_f32(ptr addrspace(1) %out, ptr ad
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_max_f32_e64 v0, v0, v1 clamp
 ; GFX11-NEXT:    global_store_b32 v2, v0, s[0:1] offset:12
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_clamp_diff_source_f32:
@@ -4208,8 +4014,6 @@ define amdgpu_kernel void @v_clamp_diff_source_f32(ptr addrspace(1) %out, ptr ad
 ; GFX12-NEXT:    s_max_num_f32 s2, s2, s3
 ; GFX12-NEXT:    v_max_num_f32_e64 v1, s2, s2 clamp
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1] offset:12
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 {
   %gep1 = getelementptr float, ptr addrspace(1) %aptr, i32 1
diff --git a/llvm/test/CodeGen/AMDGPU/commute-compares-scalar-float.ll b/llvm/test/CodeGen/AMDGPU/commute-compares-scalar-float.ll
index a33b6cb5cf4051..cc29152b36024d 100644
--- a/llvm/test/CodeGen/AMDGPU/commute-compares-scalar-float.ll
+++ b/llvm/test/CodeGen/AMDGPU/commute-compares-scalar-float.ll
@@ -11,8 +11,6 @@ define amdgpu_vs void @fcmp_f32_olt_to_ogt(ptr addrspace(1) inreg %out, float in
 ; SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s2
 ; SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: fcmp_f32_olt_to_ogt:
@@ -24,8 +22,6 @@ define amdgpu_vs void @fcmp_f32_olt_to_ogt(ptr addrspace(1) inreg %out, float in
 ; GISEL-NEXT:    s_bfe_i32 s2, s2, 0x10000
 ; GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT:    s_endpgm
 entry:
   %0 = fcmp olt float 2.0, %a
@@ -43,8 +39,6 @@ define amdgpu_vs void @fcmp_f32_ogt_to_olt(ptr addrspace(1) inreg %out, float in
 ; SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s2
 ; SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: fcmp_f32_ogt_to_olt:
@@ -56,8 +50,6 @@ define amdgpu_vs void @fcmp_f32_ogt_to_olt(ptr addrspace(1) inreg %out, float in
 ; GISEL-NEXT:    s_bfe_i32 s2, s2, 0x10000
 ; GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT:    s_endpgm
 entry:
   %0 = fcmp ogt float 2.0, %a
@@ -75,8 +67,6 @@ define amdgpu_vs void @fcmp_f32_ole_to_oge(ptr addrspace(1) inreg %out, float in
 ; SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s2
 ; SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: fcmp_f32_ole_to_oge:
@@ -88,8 +78,6 @@ define amdgpu_vs void @fcmp_f32_ole_to_oge(ptr addrspace(1) inreg %out, float in
 ; GISEL-NEXT:    s_bfe_i32 s2, s2, 0x10000
 ; GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT:    s_endpgm
 entry:
   %0 = fcmp ole float 2.0, %a
@@ -107,8 +95,6 @@ define amdgpu_vs void @fcmp_f32_oge_to_ole(ptr addrspace(1) inreg %out, float in
 ; SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s2
 ; SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: fcmp_f32_oge_to_ole:
@@ -120,8 +106,6 @@ define amdgpu_vs void @fcmp_f32_oge_to_ole(ptr addrspace(1) inreg %out, float in
 ; GISEL-NEXT:    s_bfe_i32 s2, s2, 0x10000
 ; GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT:    s_endpgm
 entry:
   %0 = fcmp oge float 2.0, %a
@@ -139,8 +123,6 @@ define amdgpu_vs void @fcmp_f32_ult_to_ugt(ptr addrspace(1) inreg %out, float in
 ; SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s2
 ; SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: fcmp_f32_ult_to_ugt:
@@ -152,8 +134,6 @@ define amdgpu_vs void @fcmp_f32_ult_to_ugt(ptr addrspace(1) inreg %out, float in
 ; GISEL-NEXT:    s_bfe_i32 s2, s2, 0x10000
 ; GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT:    s_endpgm
 entry:
   %0 = fcmp ult float 2.0, %a
@@ -171,8 +151,6 @@ define amdgpu_vs void @fcmp_f32_ugt_to_ult(ptr addrspace(1) inreg %out, float in
 ; SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s2
 ; SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: fcmp_f32_ugt_to_ult:
@@ -184,8 +162,6 @@ define amdgpu_vs void @fcmp_f32_ugt_to_ult(ptr addrspace(1) inreg %out, float in
 ; GISEL-NEXT:    s_bfe_i32 s2, s2, 0x10000
 ; GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT:    s_endpgm
 entry:
   %0 = fcmp ugt float 2.0, %a
@@ -203,8 +179,6 @@ define amdgpu_vs void @fcmp_f32_ule_to_uge(ptr addrspace(1) inreg %out, float in
 ; SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s2
 ; SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: fcmp_f32_ule_to_uge:
@@ -216,8 +190,6 @@ define amdgpu_vs void @fcmp_f32_ule_to_uge(ptr addrspace(1) inreg %out, float in
 ; GISEL-NEXT:    s_bfe_i32 s2, s2, 0x10000
 ; GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT:    s_endpgm
 entry:
   %0 = fcmp ule float 2.0, %a
@@ -235,8 +207,6 @@ define amdgpu_vs void @fcmp_f32_uge_to_ule(ptr addrspace(1) inreg %out, float in
 ; SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s2
 ; SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: fcmp_f32_uge_to_ule:
@@ -248,8 +218,6 @@ define amdgpu_vs void @fcmp_f32_uge_to_ule(ptr addrspace(1) inreg %out, float in
 ; GISEL-NEXT:    s_bfe_i32 s2, s2, 0x10000
 ; GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT:    s_endpgm
 entry:
   %0 = fcmp uge float 2.0, %a
@@ -267,8 +235,6 @@ define amdgpu_vs void @fcmp_f16_olt_to_ogt(ptr addrspace(1) inreg %out, half inr
 ; SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s2
 ; SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: fcmp_f16_olt_to_ogt:
@@ -280,8 +246,6 @@ define amdgpu_vs void @fcmp_f16_olt_to_ogt(ptr addrspace(1) inreg %out, half inr
 ; GISEL-NEXT:    s_bfe_i32 s2, s2, 0x10000
 ; GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT:    s_endpgm
 entry:
   %0 = fcmp olt half 2.0, %a
@@ -299,8 +263,6 @@ define amdgpu_vs void @fcmp_f16_ogt_to_olt(ptr addrspace(1) inreg %out, half inr
 ; SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s2
 ; SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: fcmp_f16_ogt_to_olt:
@@ -312,8 +274,6 @@ define amdgpu_vs void @fcmp_f16_ogt_to_olt(ptr addrspace(1) inreg %out, half inr
 ; GISEL-NEXT:    s_bfe_i32 s2, s2, 0x10000
 ; GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT:    s_endpgm
 entry:
   %0 = fcmp ogt half 2.0, %a
@@ -331,8 +291,6 @@ define amdgpu_vs void @fcmp_f16_ole_to_oge(ptr addrspace(1) inreg %out, half inr
 ; SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s2
 ; SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: fcmp_f16_ole_to_oge:
@@ -344,8 +302,6 @@ define amdgpu_vs void @fcmp_f16_ole_to_oge(ptr addrspace(1) inreg %out, half inr
 ; GISEL-NEXT:    s_bfe_i32 s2, s2, 0x10000
 ; GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT:    s_endpgm
 entry:
   %0 = fcmp ole half 2.0, %a
@@ -363,8 +319,6 @@ define amdgpu_vs void @fcmp_f16_oge_to_ole(ptr addrspace(1) inreg %out, half inr
 ; SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s2
 ; SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: fcmp_f16_oge_to_ole:
@@ -376,8 +330,6 @@ define amdgpu_vs void @fcmp_f16_oge_to_ole(ptr addrspace(1) inreg %out, half inr
 ; GISEL-NEXT:    s_bfe_i32 s2, s2, 0x10000
 ; GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT:    s_endpgm
 entry:
   %0 = fcmp oge half 2.0, %a
@@ -395,8 +347,6 @@ define amdgpu_vs void @fcmp_f16_ult_to_ugt(ptr addrspace(1) inreg %out, half inr
 ; SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s2
 ; SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: fcmp_f16_ult_to_ugt:
@@ -408,8 +358,6 @@ define amdgpu_vs void @fcmp_f16_ult_to_ugt(ptr addrspace(1) inreg %out, half inr
 ; GISEL-NEXT:    s_bfe_i32 s2, s2, 0x10000
 ; GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT:    s_endpgm
 entry:
   %0 = fcmp ult half 2.0, %a
@@ -427,8 +375,6 @@ define amdgpu_vs void @fcmp_f16_ugt_to_ult(ptr addrspace(1) inreg %out, half inr
 ; SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s2
 ; SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: fcmp_f16_ugt_to_ult:
@@ -440,8 +386,6 @@ define amdgpu_vs void @fcmp_f16_ugt_to_ult(ptr addrspace(1) inreg %out, half inr
 ; GISEL-NEXT:    s_bfe_i32 s2, s2, 0x10000
 ; GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT:    s_endpgm
 entry:
   %0 = fcmp ugt half 2.0, %a
@@ -459,8 +403,6 @@ define amdgpu_vs void @fcmp_ule_to_uge(ptr addrspace(1) inreg %out, half inreg %
 ; SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s2
 ; SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: fcmp_ule_to_uge:
@@ -472,8 +414,6 @@ define amdgpu_vs void @fcmp_ule_to_uge(ptr addrspace(1) inreg %out, half inreg %
 ; GISEL-NEXT:    s_bfe_i32 s2, s2, 0x10000
 ; GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT:    s_endpgm
 entry:
   %0 = fcmp ule half 2.0, %a
@@ -491,8 +431,6 @@ define amdgpu_vs void @fcmp_uge_to_ule(ptr addrspace(1) inreg %out, half inreg %
 ; SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s2
 ; SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: fcmp_uge_to_ule:
@@ -504,8 +442,6 @@ define amdgpu_vs void @fcmp_uge_to_ule(ptr addrspace(1) inreg %out, half inreg %
 ; GISEL-NEXT:    s_bfe_i32 s2, s2, 0x10000
 ; GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT:    s_endpgm
 entry:
   %0 = fcmp uge half 2.0, %a
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll
index 93e14a205f05d4..a7522ef761b8ab 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll
@@ -96,8 +96,6 @@ define amdgpu_kernel void @s_ctlz_i32(ptr addrspace(1) noalias %out, i32 %val) n
 ; GFX11-NEXT:    s_min_u32 s2, s2, 32
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
   store i32 %ctlz, ptr addrspace(1) %out, align 4
@@ -200,8 +198,6 @@ define amdgpu_kernel void @v_ctlz_i32(ptr addrspace(1) noalias %out, ptr addrspa
 ; GFX11-NEXT:    v_clz_i32_u32_e32 v0, v0
 ; GFX11-NEXT:    v_min_u32_e32 v0, 32, v0
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
@@ -322,8 +318,6 @@ define amdgpu_kernel void @v_ctlz_v2i32(ptr addrspace(1) noalias %out, ptr addrs
 ; GFX11-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX11-NEXT:    v_min_u32_e32 v0, 32, v0
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr <2 x i32>, ptr addrspace(1) %valptr, i32 %tid
@@ -471,8 +465,6 @@ define amdgpu_kernel void @v_ctlz_v4i32(ptr addrspace(1) noalias %out, ptr addrs
 ; GFX11-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX11-NEXT:    v_min_u32_e32 v0, 32, v0
 ; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr <4 x i32>, ptr addrspace(1) %valptr, i32 %tid
@@ -591,8 +583,6 @@ define amdgpu_kernel void @v_ctlz_i8(ptr addrspace(1) noalias %out, ptr addrspac
 ; GFX11-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX11-NEXT:    v_subrev_nc_u32_e32 v1, 24, v1
 ; GFX11-NEXT:    global_store_b8 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %val = load i8, ptr addrspace(1) %valptr
   %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 false) nounwind readnone
@@ -686,8 +676,6 @@ define amdgpu_kernel void @s_ctlz_i64(ptr addrspace(1) noalias %out, [8 x i32],
 ; GFX11-NEXT:    s_min_u32 s0, s0, 64
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s0
 ; GFX11-NEXT:    global_store_b64 v1, v[0:1], s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false)
   store i64 %ctlz, ptr addrspace(1) %out
@@ -771,8 +759,6 @@ define amdgpu_kernel void @s_ctlz_i64_trunc(ptr addrspace(1) noalias %out, i64 %
 ; GFX11-NEXT:    s_min_u32 s2, s2, 64
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false)
   %trunc = trunc i64 %ctlz to i32
@@ -897,8 +883,6 @@ define amdgpu_kernel void @v_ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspa
 ; GFX11-NEXT:    v_min3_u32 v0, v0, v1, 64
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
@@ -1026,8 +1010,6 @@ define amdgpu_kernel void @v_ctlz_i64_trunc(ptr addrspace(1) noalias %out, ptr a
 ; GFX11-NEXT:    v_add_nc_u32_e64 v0, v0, 32 clamp
 ; GFX11-NEXT:    v_min3_u32 v0, v0, v1, 64
 ; GFX11-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
@@ -1135,8 +1117,6 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out,
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_clz_i32_u32_e32 v0, v0
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
@@ -1244,8 +1224,6 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out,
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_clz_i32_u32_e32 v0, v0
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
@@ -1369,8 +1347,6 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias %
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 32, v0
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc_lo
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
@@ -1493,8 +1469,6 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 32, v0
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc_lo
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
@@ -1607,8 +1581,6 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_clz_i32_u32_e32 v0, v0
 ; GFX11-NEXT:    global_store_b8 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %valptr.gep = getelementptr i8, ptr addrspace(1) %valptr, i32 %tid
@@ -1735,8 +1707,6 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %val = load i16, ptr addrspace(1) %valptr
   %ctlz = call i16 @llvm.ctlz.i16(i16 %val, i1 false) nounwind readnone
@@ -1855,8 +1825,6 @@ define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out,
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0x7f, v0
 ; GFX11-NEXT:    global_store_b8 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %valptr.gep = getelementptr i7, ptr addrspace(1) %valptr, i32 %tid
diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
index f3f749b5c054b3..850e701513fd7a 100644
--- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -962,8 +962,6 @@ define amdgpu_kernel void @load_i8_to_f32(ptr addrspace(1) noalias %out, ptr add
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr i8, ptr addrspace(1) %in, i32 %tid
@@ -1050,8 +1048,6 @@ define amdgpu_kernel void @load_v2i8_to_v2f32(ptr addrspace(1) noalias %out, ptr
 ; GFX11-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
 ; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr <2 x i8>, ptr addrspace(1) %in, i32 %tid
@@ -1143,8 +1139,6 @@ define amdgpu_kernel void @load_v3i8_to_v3f32(ptr addrspace(1) noalias %out, ptr
 ; GFX11-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
 ; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
 ; GFX11-NEXT:    global_store_b96 v3, v[0:2], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr <3 x i8>, ptr addrspace(1) %in, i32 %tid
@@ -1241,8 +1235,6 @@ define amdgpu_kernel void @load_v4i8_to_v4f32(ptr addrspace(1) noalias %out, ptr
 ; GFX11-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
 ; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
 ; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid
@@ -1380,8 +1372,6 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
 ; GFX11-NEXT:    global_store_b128 v5, v[0:3], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid
@@ -1553,8 +1543,6 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b128 v6, v[0:3], s[0:1]
 ; GFX11-NEXT:    global_store_b32 v6, v4, s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid
@@ -1748,8 +1736,6 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
 ; GFX11-NEXT:    global_store_b32 v4, v5, s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
   %in.ptr = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid.x
@@ -1937,8 +1923,6 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b96 v8, v[4:6], s[0:1] offset:16
 ; GFX11-NEXT:    global_store_b128 v8, v[0:3], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr <7 x i8>, ptr addrspace(1) %in, i32 %tid
@@ -2061,8 +2045,6 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b128 v10, v[4:7], s[0:1] offset:16
 ; GFX11-NEXT:    global_store_b128 v10, v[0:3], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr <8 x i8>, ptr addrspace(1) %in, i32 %tid
@@ -2148,8 +2130,6 @@ define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(ptr addrspace(1) noalias %ou
 ; GFX11-NEXT:    v_add_nc_u32_e32 v0, 2, v0
 ; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid
@@ -2232,8 +2212,6 @@ define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(ptr addrspace(1) noalias %ou
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid
@@ -2312,8 +2290,6 @@ define amdgpu_kernel void @i8_zext_i32_to_f32(ptr addrspace(1) noalias %out, ptr
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr i8, ptr addrspace(1) %in, i32 %tid
@@ -2448,8 +2424,6 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
 ; GFX11-NEXT:    global_store_b128 v5, v[0:3], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid
@@ -2531,8 +2505,6 @@ define amdgpu_kernel void @extract_byte0_to_f32(ptr addrspace(1) noalias %out, p
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid
@@ -2614,8 +2586,6 @@ define amdgpu_kernel void @extract_byte1_to_f32(ptr addrspace(1) noalias %out, p
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid
@@ -2698,8 +2668,6 @@ define amdgpu_kernel void @extract_byte2_to_f32(ptr addrspace(1) noalias %out, p
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_ubyte2_e32 v0, v0
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid
@@ -2782,8 +2750,6 @@ define amdgpu_kernel void @extract_byte3_to_f32(ptr addrspace(1) noalias %out, p
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_ubyte3_e32 v0, v0
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid
@@ -2879,8 +2845,6 @@ define amdgpu_kernel void @cvt_ubyte0_or_multiuse(ptr addrspace(1) %in, ptr addr
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    global_store_b32 v2, v0, s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 bb:
   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -3021,8 +2985,6 @@ define amdgpu_kernel void @cvt_f32_ubyte0_vector() local_unnamed_addr {
 ; GFX11-NEXT:    global_store_b8 v[0:1], v3, off
 ; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX11-NEXT:    global_store_b8 v[0:1], v1, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   br label %for.body.i
diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
index 5fae60a7acac14..f139943ff2bcbf 100644
--- a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
+++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
@@ -49,8 +49,6 @@ define amdgpu_kernel void @uniform_vec_0_i16(ptr addrspace(1) %out, i16 %a) {
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tmp = insertelement <2 x i16> undef, i16 0, i32 0
   %vec = insertelement <2 x i16> %tmp, i16 %a, i32 1
@@ -134,8 +132,6 @@ define amdgpu_kernel void @uniform_vec_i16_0(ptr addrspace(1) %out, i16 %a) {
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tmp = insertelement <2 x i16> undef, i16 %a, i32 0
   %vec = insertelement <2 x i16> %tmp, i16 0, i32 1
@@ -219,8 +215,6 @@ define amdgpu_kernel void @uniform_vec_f16_0(ptr addrspace(1) %out, half %a) {
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tmp = insertelement <2 x half> undef, half %a, i32 0
   %vec = insertelement <2 x half> %tmp, half 0.0, i32 1
@@ -402,8 +396,6 @@ define amdgpu_kernel void @uniform_vec_i16_LH(ptr addrspace(1) %out, i16 %a, i32
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %shift = lshr i32 %b, 16
   %tr = trunc i32 %shift to i16
@@ -492,8 +484,6 @@ define amdgpu_kernel void @uniform_vec_i16_HH(ptr addrspace(1) %out, i32 %a, i32
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %shift_a = lshr i32 %a, 16
   %tr_a = trunc i32 %shift_a to i16
@@ -730,8 +720,6 @@ define amdgpu_kernel void @build_vec_v2i16_undeflo_uniform(ptr addrspace(3) %in,
 ; GFX11-NEXT:    ds_load_u16_d16 v0, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %load = load i16, ptr addrspace(3) %in
diff --git a/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll b/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll
index 41a9d7999e80a3..352d55073d6722 100644
--- a/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll
@@ -120,8 +120,6 @@ define amdgpu_kernel void @write_ds_sub0_offset0_global_clamp_bit(float %dummy.v
 ; GFX11-NEXT:    v_div_fmas_f32 v4, s0, s0, s0
 ; GFX11-NEXT:    global_store_b32 v[0:1], v4, off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %x.i = call i32 @llvm.amdgcn.workitem.id.x() #1
@@ -197,8 +195,6 @@ define amdgpu_kernel void @write_ds_sub_max_offset_global_clamp_bit(float %dummy
 ; GFX11-NEXT:    v_div_fmas_f32 v4, s0, s0, s0
 ; GFX11-NEXT:    global_store_b32 v[0:1], v4, off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %x.i = call i32 @llvm.amdgcn.workitem.id.x() #1
   %sub1 = sub i32 -1, %x.i
@@ -602,8 +598,6 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_
 ; GFX11-NEXT:    v_div_fmas_f32 v5, s0, s0, s0
 ; GFX11-NEXT:    global_store_b32 v[0:1], v5, off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
   %neg = sub i32 0, %x.i
diff --git a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
index 54fb1dc5c05274..d6bde79802847c 100644
--- a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
+++ b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
@@ -194,8 +194,6 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 10
 ; GFX11-NEXT:    global_store_b32 v[0:1], v2, off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 bb:
   %i1 = add i32 %i, %i
diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
index f4ec16db55d68a..858eaace8dcbdf 100644
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
@@ -47,8 +47,6 @@ define amdgpu_kernel void @extract_vector_elt_v2f16(ptr addrspace(1) %out, ptr a
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_d16_hi_b16 v0, v1, s[0:1]
 ; GFX11-NEXT:    buffer_store_b16 v1, off, s[0:3], 0 offset:20
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %vec = load <2 x half>, ptr addrspace(4) %vec.ptr
   %p0 = extractelement <2 x half> %vec, i32 0
@@ -107,8 +105,6 @@ define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_sgpr(ptr addrspace(1
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %vec = load <2 x half>, ptr addrspace(4) %vec.ptr
   %elt = extractelement <2 x half> %vec, i32 %idx
@@ -178,8 +174,6 @@ define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_vgpr(ptr addrspace(1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_lshrrev_b32_e64 v1, v1, s2
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -234,8 +228,6 @@ define amdgpu_kernel void @extract_vector_elt_v3f16(ptr addrspace(1) %out, <3 x
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
 ; GFX11-NEXT:    buffer_store_b16 v1, off, s[4:7], 0 offset:2
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %p0 = extractelement <3 x half> %foo, i32 0
   %p1 = extractelement <3 x half> %foo, i32 2
@@ -290,8 +282,6 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v3f16(ptr addrspace(1) %ou
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %p0 = extractelement <3 x half> %foo, i32 %idx
   %out1 = getelementptr half, ptr addrspace(1) %out, i32 1
@@ -347,8 +337,6 @@ define amdgpu_kernel void @v_extractelement_v4f16_2(ptr addrspace(1) %out, ptr a
 ; GFX11-NEXT:    global_load_b32 v1, v1, s[2:3] offset:4
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
@@ -422,8 +410,6 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_vgpr(ptr addrspace(1) %
 ; GFX11-NEXT:    v_lshrrev_b64 v[0:1], v3, v[0:1]
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 1, v2
 ; GFX11-NEXT:    global_store_b16 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
@@ -486,8 +472,6 @@ define amdgpu_kernel void @reduce_load_vector_v8f16_extract_01(ptr addrspace(4)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_store_b16 v1, off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %load = load <16 x half>, ptr addrspace(4) %ptr
   %elt0 = extractelement <16 x half> %load, i32 0
@@ -547,8 +531,6 @@ define amdgpu_kernel void @reduce_load_vector_v8f16_extract_23(ptr addrspace(4)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_store_b16 v1, off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %load = load <16 x half>, ptr addrspace(4) %ptr
   %elt2 = extractelement <16 x half> %load, i32 2
@@ -696,8 +678,6 @@ define amdgpu_kernel void @v_extractelement_v8f16_dynamic_sgpr(ptr addrspace(1)
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX11-NEXT:    global_store_b16 v2, v0, s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
@@ -952,8 +932,6 @@ define amdgpu_kernel void @v_extractelement_v16f16_dynamic_sgpr(ptr addrspace(1)
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX11-NEXT:    global_store_b16 v2, v0, s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
index 21799ab79b8396..986f27b19dba27 100644
--- a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
@@ -54,8 +54,6 @@ define amdgpu_kernel void @s_fabs_free_f16(ptr addrspace(1) %out, i16 %in) {
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %bc= bitcast i16 %in to half
   %fabs = call half @llvm.fabs.f16(half %bc)
@@ -109,8 +107,6 @@ define amdgpu_kernel void @s_fabs_f16(ptr addrspace(1) %out, half %in) {
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %fabs = call half @llvm.fabs.f16(half %in)
   store half %fabs, ptr addrspace(1) %out
@@ -163,8 +159,6 @@ define amdgpu_kernel void @s_fabs_v2f16(ptr addrspace(1) %out, <2 x half> %in) {
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in)
   store <2 x half> %fabs, ptr addrspace(1) %out
@@ -220,8 +214,6 @@ define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) {
 ; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %in)
   store <4 x half> %fabs, ptr addrspace(1) %out
@@ -281,8 +273,6 @@ define amdgpu_kernel void @fabs_fold_f16(ptr addrspace(1) %out, half %in0, half
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_mul_f16_e64 v1, |s4|, s2
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %fabs = call half @llvm.fabs.f16(half %in0)
   %fmul = fmul half %fabs, %in1
@@ -341,8 +331,6 @@ define amdgpu_kernel void @v_fabs_v2f16(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0x7fff7fff, v1
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.in = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i32 %tid
@@ -399,8 +387,6 @@ define amdgpu_kernel void @fabs_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 {
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %bc = bitcast i32 %in to <2 x half>
   %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %bc)
@@ -481,8 +467,6 @@ define amdgpu_kernel void @v_fabs_fold_self_v2f16(ptr addrspace(1) %out, ptr add
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0x7fff7fff, v0
 ; GFX11-NEXT:    v_pk_mul_f16 v0, v1, v0
 ; GFX11-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr <2 x half>, ptr addrspace(1) %in, i32 %tid
@@ -571,8 +555,6 @@ define amdgpu_kernel void @v_fabs_fold_v2f16(ptr addrspace(1) %out, ptr addrspac
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
 ; GFX11-NEXT:    v_pk_mul_f16 v0, v0, s0
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr <2 x half>, ptr addrspace(1) %in, i32 %tid
@@ -660,8 +642,6 @@ define amdgpu_kernel void @v_extract_fabs_fold_v2f16(ptr addrspace(1) %in) #0 {
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    global_store_b16 v[0:1], v1, off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.in = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i32 %tid
@@ -742,8 +722,6 @@ define amdgpu_kernel void @v_extract_fabs_no_fold_v2f16(ptr addrspace(1) %in) #0
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    global_store_d16_hi_b16 v[0:1], v0, off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.in = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i32 %tid
diff --git a/llvm/test/CodeGen/AMDGPU/fadd.f16.ll b/llvm/test/CodeGen/AMDGPU/fadd.f16.ll
index 1094b768f1bd16..e3c9e8ccdc3998 100644
--- a/llvm/test/CodeGen/AMDGPU/fadd.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fadd.f16.ll
@@ -80,8 +80,6 @@ define amdgpu_kernel void @fadd_f16(
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SDAG-NEXT:    v_add_f16_e32 v0.l, v0.l, v0.h
 ; GFX11-SDAG-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: fadd_f16:
@@ -102,8 +100,6 @@ define amdgpu_kernel void @fadd_f16(
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
 ; GFX11-GISEL-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-SDAG-LABEL: fadd_f16:
@@ -128,8 +124,6 @@ define amdgpu_kernel void @fadd_f16(
 ; GFX11-FAKE16-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FAKE16-SDAG-NEXT:    v_add_f16_e32 v0, v0, v1
 ; GFX11-FAKE16-SDAG-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
-; GFX11-FAKE16-SDAG-NEXT:    s_nop 0
-; GFX11-FAKE16-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FAKE16-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-GISEL-LABEL: fadd_f16:
@@ -150,8 +144,6 @@ define amdgpu_kernel void @fadd_f16(
 ; GFX11-FAKE16-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FAKE16-GISEL-NEXT:    v_add_f16_e32 v0, v0, v1
 ; GFX11-FAKE16-GISEL-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-FAKE16-GISEL-NEXT:    s_nop 0
-; GFX11-FAKE16-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FAKE16-GISEL-NEXT:    s_endpgm
 ; GFX11-LABEL: fadd_f16:
 ; GFX11:       ; %bb.0: ; %entry
@@ -246,8 +238,6 @@ define amdgpu_kernel void @fadd_f16_imm_a(
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-SDAG-NEXT:    v_add_f16_e32 v0.l, 1.0, v0.l
 ; GFX11-SDAG-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: fadd_f16_imm_a:
@@ -262,8 +252,6 @@ define amdgpu_kernel void @fadd_f16_imm_a(
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    v_add_f16_e32 v0.l, 1.0, v0.l
 ; GFX11-GISEL-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-SDAG-LABEL: fadd_f16_imm_a:
@@ -282,8 +270,6 @@ define amdgpu_kernel void @fadd_f16_imm_a(
 ; GFX11-FAKE16-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FAKE16-SDAG-NEXT:    v_add_f16_e32 v0, 1.0, v0
 ; GFX11-FAKE16-SDAG-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-FAKE16-SDAG-NEXT:    s_nop 0
-; GFX11-FAKE16-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FAKE16-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-GISEL-LABEL: fadd_f16_imm_a:
@@ -298,8 +284,6 @@ define amdgpu_kernel void @fadd_f16_imm_a(
 ; GFX11-FAKE16-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FAKE16-GISEL-NEXT:    v_add_f16_e32 v0, 1.0, v0
 ; GFX11-FAKE16-GISEL-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
-; GFX11-FAKE16-GISEL-NEXT:    s_nop 0
-; GFX11-FAKE16-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FAKE16-GISEL-NEXT:    s_endpgm
 ; GFX11-LABEL: fadd_f16_imm_a:
 ; GFX11:       ; %bb.0: ; %entry
@@ -386,8 +370,6 @@ define amdgpu_kernel void @fadd_f16_imm_b(
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-SDAG-NEXT:    v_add_f16_e32 v0.l, 2.0, v0.l
 ; GFX11-SDAG-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: fadd_f16_imm_b:
@@ -402,8 +384,6 @@ define amdgpu_kernel void @fadd_f16_imm_b(
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    v_add_f16_e32 v0.l, 2.0, v0.l
 ; GFX11-GISEL-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-SDAG-LABEL: fadd_f16_imm_b:
@@ -422,8 +402,6 @@ define amdgpu_kernel void @fadd_f16_imm_b(
 ; GFX11-FAKE16-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FAKE16-SDAG-NEXT:    v_add_f16_e32 v0, 2.0, v0
 ; GFX11-FAKE16-SDAG-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-FAKE16-SDAG-NEXT:    s_nop 0
-; GFX11-FAKE16-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FAKE16-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-GISEL-LABEL: fadd_f16_imm_b:
@@ -438,8 +416,6 @@ define amdgpu_kernel void @fadd_f16_imm_b(
 ; GFX11-FAKE16-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FAKE16-GISEL-NEXT:    v_add_f16_e32 v0, 2.0, v0
 ; GFX11-FAKE16-GISEL-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
-; GFX11-FAKE16-GISEL-NEXT:    s_nop 0
-; GFX11-FAKE16-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FAKE16-GISEL-NEXT:    s_endpgm
 ; GFX11-LABEL: fadd_f16_imm_b:
 ; GFX11:       ; %bb.0: ; %entry
@@ -550,8 +526,6 @@ define amdgpu_kernel void @fadd_v2f16(
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-SDAG-NEXT:    v_pk_add_f16 v0, v1, v0
 ; GFX11-SDAG-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: fadd_v2f16:
@@ -571,8 +545,6 @@ define amdgpu_kernel void @fadd_v2f16(
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    v_pk_add_f16 v0, v1, v0
 ; GFX11-GISEL-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-SDAG-LABEL: fadd_v2f16:
@@ -594,8 +566,6 @@ define amdgpu_kernel void @fadd_v2f16(
 ; GFX11-FAKE16-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FAKE16-SDAG-NEXT:    v_pk_add_f16 v0, v1, v0
 ; GFX11-FAKE16-SDAG-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX11-FAKE16-SDAG-NEXT:    s_nop 0
-; GFX11-FAKE16-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FAKE16-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-GISEL-LABEL: fadd_v2f16:
@@ -615,8 +585,6 @@ define amdgpu_kernel void @fadd_v2f16(
 ; GFX11-FAKE16-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FAKE16-GISEL-NEXT:    v_pk_add_f16 v0, v1, v0
 ; GFX11-FAKE16-GISEL-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-FAKE16-GISEL-NEXT:    s_nop 0
-; GFX11-FAKE16-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FAKE16-GISEL-NEXT:    s_endpgm
 ; GFX11-LABEL: fadd_v2f16:
 ; GFX11:       ; %bb.0: ; %entry
@@ -716,8 +684,6 @@ define amdgpu_kernel void @fadd_v2f16_imm_a(
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-SDAG-NEXT:    v_pk_add_f16 v0, 0x40003c00, v0
 ; GFX11-SDAG-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: fadd_v2f16_imm_a:
@@ -733,8 +699,6 @@ define amdgpu_kernel void @fadd_v2f16_imm_a(
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    v_pk_add_f16 v0, 0x40003c00, v0
 ; GFX11-GISEL-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-SDAG-LABEL: fadd_v2f16_imm_a:
@@ -752,8 +716,6 @@ define amdgpu_kernel void @fadd_v2f16_imm_a(
 ; GFX11-FAKE16-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FAKE16-SDAG-NEXT:    v_pk_add_f16 v0, 0x40003c00, v0
 ; GFX11-FAKE16-SDAG-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-FAKE16-SDAG-NEXT:    s_nop 0
-; GFX11-FAKE16-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FAKE16-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-GISEL-LABEL: fadd_v2f16_imm_a:
@@ -769,8 +731,6 @@ define amdgpu_kernel void @fadd_v2f16_imm_a(
 ; GFX11-FAKE16-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FAKE16-GISEL-NEXT:    v_pk_add_f16 v0, 0x40003c00, v0
 ; GFX11-FAKE16-GISEL-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX11-FAKE16-GISEL-NEXT:    s_nop 0
-; GFX11-FAKE16-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FAKE16-GISEL-NEXT:    s_endpgm
 ; GFX11-LABEL: fadd_v2f16_imm_a:
 ; GFX11:       ; %bb.0: ; %entry
@@ -863,8 +823,6 @@ define amdgpu_kernel void @fadd_v2f16_imm_b(
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-SDAG-NEXT:    v_pk_add_f16 v0, 0x3c004000, v0
 ; GFX11-SDAG-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: fadd_v2f16_imm_b:
@@ -880,8 +838,6 @@ define amdgpu_kernel void @fadd_v2f16_imm_b(
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    v_pk_add_f16 v0, 0x3c004000, v0
 ; GFX11-GISEL-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-SDAG-LABEL: fadd_v2f16_imm_b:
@@ -899,8 +855,6 @@ define amdgpu_kernel void @fadd_v2f16_imm_b(
 ; GFX11-FAKE16-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FAKE16-SDAG-NEXT:    v_pk_add_f16 v0, 0x3c004000, v0
 ; GFX11-FAKE16-SDAG-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-FAKE16-SDAG-NEXT:    s_nop 0
-; GFX11-FAKE16-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FAKE16-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-GISEL-LABEL: fadd_v2f16_imm_b:
@@ -916,8 +870,6 @@ define amdgpu_kernel void @fadd_v2f16_imm_b(
 ; GFX11-FAKE16-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FAKE16-GISEL-NEXT:    v_pk_add_f16 v0, 0x3c004000, v0
 ; GFX11-FAKE16-GISEL-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX11-FAKE16-GISEL-NEXT:    s_nop 0
-; GFX11-FAKE16-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FAKE16-GISEL-NEXT:    s_endpgm
 ; GFX11-LABEL: fadd_v2f16_imm_b:
 ; GFX11:       ; %bb.0: ; %entry
diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
index 7252c69cb1cf75..dddd649888af27 100644
--- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
+++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
@@ -121,8 +121,6 @@ define amdgpu_kernel void @global_store_2xi16_align2(ptr addrspace(1) %p, ptr ad
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x20001
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_store_2xi16_align2:
@@ -131,8 +129,6 @@ define amdgpu_kernel void @global_store_2xi16_align2(ptr addrspace(1) %p, ptr ad
 ; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x20001
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %gep.r = getelementptr i16, ptr addrspace(1) %r, i64 1
   store i16 1, ptr addrspace(1) %r, align 2
@@ -278,8 +274,6 @@ define amdgpu_kernel void @global_store_2xi16_align1(ptr addrspace(1) %p, ptr ad
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x20001
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_store_2xi16_align1:
@@ -288,8 +282,6 @@ define amdgpu_kernel void @global_store_2xi16_align1(ptr addrspace(1) %p, ptr ad
 ; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x20001
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %gep.r = getelementptr i16, ptr addrspace(1) %r, i64 1
   store i16 1, ptr addrspace(1) %r, align 1
@@ -400,8 +392,6 @@ define amdgpu_kernel void @global_store_2xi16_align4(ptr addrspace(1) %p, ptr ad
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x20001
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_store_2xi16_align4:
@@ -410,8 +400,6 @@ define amdgpu_kernel void @global_store_2xi16_align4(ptr addrspace(1) %p, ptr ad
 ; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x20001
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %gep.r = getelementptr i16, ptr addrspace(1) %r, i64 1
   store i16 1, ptr addrspace(1) %r, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
index 7af972b96ec68c..26580618794d34 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
@@ -53,8 +53,6 @@ define amdgpu_kernel void @test_fold_canonicalize_undef_value_f16(ptr addrspace(
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b16 v0, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %canonicalized = call half @llvm.canonicalize.f16(half undef)
   store half %canonicalized, ptr addrspace(1) %out
@@ -107,8 +105,6 @@ define amdgpu_kernel void @v_test_canonicalize_var_f16(ptr addrspace(1) %out) #1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_max_f16_e32 v0, v0, v0
 ; GFX11-NEXT:    global_store_b16 v[0:1], v0, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %val = load half, ptr addrspace(1) %out
   %canonicalized = call half @llvm.canonicalize.f16(half %val)
@@ -160,8 +156,6 @@ define amdgpu_kernel void @s_test_canonicalize_var_f16(ptr addrspace(1) %out, i1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_max_f16_e64 v1, s4, s4
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %val = bitcast i16 %val.arg to half
   %canonicalized = call half @llvm.canonicalize.f16(half %val)
@@ -254,8 +248,6 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f16(ptr addrspace(1) %ou
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_max_f16_e64 v1, |v1|, |v1|
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %val = load half, ptr addrspace(1) %out
   %val.fabs = call half @llvm.fabs.f16(half %val)
@@ -310,8 +302,6 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f16(ptr addrspace(1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_max_f16_e64 v1, -|v1|, -|v1|
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %val = load half, ptr addrspace(1) %out
   %val.fabs = call half @llvm.fabs.f16(half %val)
@@ -367,8 +357,6 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f16(ptr addrspace(1) %ou
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_max_f16_e64 v1, -v1, -v1
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %val = load half, ptr addrspace(1) %out
   %val.fneg = fneg half %val
@@ -423,8 +411,6 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_var_f16(ptr add
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_max_f16_e64 v1, -v1, -v1
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %val = load half, ptr addrspace(1) %out
   %val.fneg = fneg half %val
@@ -479,8 +465,6 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_fabs_var_f16(pt
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_max_f16_e64 v1, -|v1|, -|v1|
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %val = load half, ptr addrspace(1) %out
   %val.fabs = call half @llvm.fabs.f16(half %val)
@@ -525,8 +509,6 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f16(ptr addrspace(1) %out)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b16 v0, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %canonicalized = call half @llvm.canonicalize.f16(half 0.0)
   store half %canonicalized, ptr addrspace(1) %out
@@ -569,8 +551,6 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f16(ptr addrspace(1) %out)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff8000
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %canonicalized = call half @llvm.canonicalize.f16(half -0.0)
   store half %canonicalized, ptr addrspace(1) %out
@@ -613,8 +593,6 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f16(ptr addrspace(1) %out)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3c00
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %canonicalized = call half @llvm.canonicalize.f16(half 1.0)
   store half %canonicalized, ptr addrspace(1) %out
@@ -657,8 +635,6 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f16(ptr addrspace(1) %out)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffffbc00
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %canonicalized = call half @llvm.canonicalize.f16(half -1.0)
   store half %canonicalized, ptr addrspace(1) %out
@@ -701,8 +677,6 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f16(ptr addrspace(1) %
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x4c00
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %canonicalized = call half @llvm.canonicalize.f16(half 16.0)
   store half %canonicalized, ptr addrspace(1) %out
@@ -745,8 +719,6 @@ define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal0_f1
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %canonicalized = call half @llvm.canonicalize.f16(half 0xH03FF)
   store half %canonicalized, ptr addrspace(1) %out
@@ -789,8 +761,6 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f16(ptr ad
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %canonicalized = call half @llvm.canonicalize.f16(half 0xH03FF)
   store half %canonicalized, ptr addrspace(1) %out
@@ -833,8 +803,6 @@ define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal1_f1
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff83ff
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %canonicalized = call half @llvm.canonicalize.f16(half 0xH83FF)
   store half %canonicalized, ptr addrspace(1) %out
@@ -877,8 +845,6 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f16(ptr ad
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff83ff
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %canonicalized = call half @llvm.canonicalize.f16(half 0xH83FF)
   store half %canonicalized, ptr addrspace(1) %out
@@ -921,8 +887,6 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f16(ptr addrspace(1) %out
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7c00
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %canonicalized = call half @llvm.canonicalize.f16(half 0xH7C00)
   store half %canonicalized, ptr addrspace(1) %out
@@ -965,8 +929,6 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f16(ptr addrsp
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %canonicalized = call half @llvm.canonicalize.f16(half bitcast (i16 -1 to half))
   store half %canonicalized, ptr addrspace(1) %out
@@ -1009,8 +971,6 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f16(ptr addrsp
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %canonicalized = call half @llvm.canonicalize.f16(half bitcast (i16 -2 to half))
   store half %canonicalized, ptr addrspace(1) %out
@@ -1053,8 +1013,6 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f16(ptr addrspace(
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %canonicalized = call half @llvm.canonicalize.f16(half 0xH7C01)
   store half %canonicalized, ptr addrspace(1) %out
@@ -1097,8 +1055,6 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f16(ptr addrspace(
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %canonicalized = call half @llvm.canonicalize.f16(half 0xH7DFF)
   store half %canonicalized, ptr addrspace(1) %out
@@ -1141,8 +1097,6 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f16(ptr addrspace(
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %canonicalized = call half @llvm.canonicalize.f16(half 0xHFDFF)
   store half %canonicalized, ptr addrspace(1) %out
@@ -1185,8 +1139,6 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f16(ptr addrspace(
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %canonicalized = call half @llvm.canonicalize.f16(half 0xHFC01)
   store half %canonicalized, ptr addrspace(1) %out
@@ -1258,8 +1210,6 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f16(ptr addrspace(1) %out)
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_max_f16 v0, v0, v0
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid
@@ -1336,8 +1286,6 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2f16(ptr addrspace(1) %
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
 ; GFX11-NEXT:    v_pk_max_f16 v0, v0, v0
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid
@@ -1416,8 +1364,6 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(ptr addrspace
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
 ; GFX11-NEXT:    v_pk_max_f16 v0, v0, v0 neg_lo:[1,1] neg_hi:[1,1]
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid
@@ -1495,8 +1441,6 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2f16(ptr addrspace(1) %
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_max_f16 v0, v0, v0 neg_lo:[1,1] neg_hi:[1,1]
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid
@@ -1560,8 +1504,6 @@ define amdgpu_kernel void @s_test_canonicalize_var_v2f16(ptr addrspace(1) %out,
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_pk_max_f16 v1, s4, s4
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %val = bitcast i32 %val.arg to <2 x half>
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val)
@@ -1604,8 +1546,6 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_v2f16(ptr addrspace(1) %out
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v0, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> zeroinitializer)
   store <2 x half> %canonicalized, ptr addrspace(1) %out
@@ -1648,8 +1588,6 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_v2f16(ptr addrspace(1) %out
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x80008000
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half -0.0, half -0.0>)
   store <2 x half> %canonicalized, ptr addrspace(1) %out
@@ -1692,8 +1630,6 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_v2f16(ptr addrspace(1) %out
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3c003c00
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 1.0, half 1.0>)
   store <2 x half> %canonicalized, ptr addrspace(1) %out
@@ -1736,8 +1672,6 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_v2f16(ptr addrspace(1) %out
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xbc00bc00
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half -1.0, half -1.0>)
   store <2 x half> %canonicalized, ptr addrspace(1) %out
@@ -1780,8 +1714,6 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_v2f16(ptr addrspace(1)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x4c004c00
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 16.0, half 16.0>)
   store <2 x half> %canonicalized, ptr addrspace(1) %out
@@ -1824,8 +1756,6 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_v2f16(p
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff03ff
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH03FF, half 0xH03FF>)
   store <2 x half> %canonicalized, ptr addrspace(1) %out
@@ -1868,8 +1798,6 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_v2f16(ptr
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff03ff
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH03FF, half 0xH03FF>)
   store <2 x half> %canonicalized, ptr addrspace(1) %out
@@ -1912,8 +1840,6 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_v2f16(p
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x83ff83ff
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH83FF, half 0xH83FF>)
   store <2 x half> %canonicalized, ptr addrspace(1) %out
@@ -1956,8 +1882,6 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_v2f16(ptr
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x83ff83ff
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH83FF, half 0xH83FF>)
   store <2 x half> %canonicalized, ptr addrspace(1) %out
@@ -2000,8 +1924,6 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_v2f16(ptr addrspace(1) %o
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7c007c00
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH7C00, half 0xH7C00>)
   store <2 x half> %canonicalized, ptr addrspace(1) %out
@@ -2044,8 +1966,6 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_v2f16(ptr addr
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> bitcast (i32 -1 to <2 x half>))
   store <2 x half> %canonicalized, ptr addrspace(1) %out
@@ -2088,8 +2008,6 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_v2f16(ptr addr
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half bitcast (i16 -2 to half), half bitcast (i16 -2 to half)>)
   store <2 x half> %canonicalized, ptr addrspace(1) %out
@@ -2132,8 +2050,6 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_v2f16(ptr addrspac
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH7C01, half 0xH7C01>)
   store <2 x half> %canonicalized, ptr addrspace(1) %out
@@ -2176,8 +2092,6 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_v2f16(ptr addrspac
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH7DFF, half 0xH7DFF>)
   store <2 x half> %canonicalized, ptr addrspace(1) %out
@@ -2220,8 +2134,6 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_v2f16(ptr addrspac
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xHFDFF, half 0xHFDFF>)
   store <2 x half> %canonicalized, ptr addrspace(1) %out
@@ -2264,8 +2176,6 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_v2f16(ptr addrspac
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xHFC01, half 0xHFC01>)
   store <2 x half> %canonicalized, ptr addrspace(1) %out
@@ -2387,8 +2297,6 @@ define amdgpu_kernel void @s_test_canonicalize_undef_v2f16(ptr addrspace(1) %out
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v0, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> undef)
   store <2 x half> %canonicalized, ptr addrspace(1) %out
@@ -2694,8 +2602,6 @@ define amdgpu_kernel void @s_test_canonicalize_undef_v4f16(ptr addrspace(1) %out
 ; GFX11-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %canonicalized = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> undef)
   store <4 x half> %canonicalized, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
index f0ce96af90649d..adf1635b29145c 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
@@ -53,8 +53,6 @@ define amdgpu_kernel void @v_test_canonicalize_var_f32(ptr addrspace(1) %out) #1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_test_canonicalize_var_f32:
@@ -66,8 +64,6 @@ define amdgpu_kernel void @v_test_canonicalize_var_f32(ptr addrspace(1) %out) #1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_max_num_f32_e32 v1, v1, v1
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %val = load float, ptr addrspace(1) %out
   %canonicalized = call float @llvm.canonicalize.f32(float %val)
@@ -117,8 +113,6 @@ define amdgpu_kernel void @s_test_canonicalize_var_f32(ptr addrspace(1) %out, fl
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_max_f32_e64 v1, s4, s4
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: s_test_canonicalize_var_f32:
@@ -128,8 +122,6 @@ define amdgpu_kernel void @s_test_canonicalize_var_f32(ptr addrspace(1) %out, fl
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_max_num_f32_e64 v1, s2, s2
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %canonicalized = call float @llvm.canonicalize.f32(float %val)
   store float %canonicalized, ptr addrspace(1) %out
@@ -169,8 +161,6 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f32(ptr addrspace(1) %ou
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_max_f32_e64 v1, |v1|, |v1|
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_test_canonicalize_fabs_var_f32:
@@ -182,8 +172,6 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f32(ptr addrspace(1) %ou
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_max_num_f32_e64 v1, |v1|, |v1|
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %val = load float, ptr addrspace(1) %out
   %val.fabs = call float @llvm.fabs.f32(float %val)
@@ -225,8 +213,6 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f32(ptr addrspace(1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_max_f32_e64 v1, -|v1|, -|v1|
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_test_canonicalize_fneg_fabs_var_f32:
@@ -238,8 +224,6 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f32(ptr addrspace(1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_max_num_f32_e64 v1, -|v1|, -|v1|
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %val = load float, ptr addrspace(1) %out
   %val.fabs = call float @llvm.fabs.f32(float %val)
@@ -282,8 +266,6 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f32(ptr addrspace(1) %ou
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_max_f32_e64 v1, -v1, -v1
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_test_canonicalize_fneg_var_f32:
@@ -295,8 +277,6 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f32(ptr addrspace(1) %ou
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_max_num_f32_e64 v1, -v1, -v1
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %val = load float, ptr addrspace(1) %out
   %val.fneg = fneg float %val
@@ -330,8 +310,6 @@ define amdgpu_kernel void @test_fold_canonicalize_undef_f32(ptr addrspace(1) %ou
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v0, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: test_fold_canonicalize_undef_f32:
@@ -340,8 +318,6 @@ define amdgpu_kernel void @test_fold_canonicalize_undef_f32(ptr addrspace(1) %ou
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v0, v0, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %canonicalized = call float @llvm.canonicalize.f32(float undef)
   store float %canonicalized, ptr addrspace(1) %out
@@ -373,8 +349,6 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f32(ptr addrspace(1) %out)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v0, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: test_fold_canonicalize_p0_f32:
@@ -383,8 +357,6 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f32(ptr addrspace(1) %out)
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v0, v0, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %canonicalized = call float @llvm.canonicalize.f32(float 0.0)
   store float %canonicalized, ptr addrspace(1) %out
@@ -418,8 +390,6 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f32(ptr addrspace(1) %out)
 ; GFX11-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: test_fold_canonicalize_n0_f32:
@@ -429,8 +399,6 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f32(ptr addrspace(1) %out)
 ; GFX12-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %canonicalized = call float @llvm.canonicalize.f32(float -0.0)
   store float %canonicalized, ptr addrspace(1) %out
@@ -463,8 +431,6 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f32(ptr addrspace(1) %out)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 1.0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: test_fold_canonicalize_p1_f32:
@@ -473,8 +439,6 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f32(ptr addrspace(1) %out)
 ; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 1.0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %canonicalized = call float @llvm.canonicalize.f32(float 1.0)
   store float %canonicalized, ptr addrspace(1) %out
@@ -507,8 +471,6 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f32(ptr addrspace(1) %out)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, -1.0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: test_fold_canonicalize_n1_f32:
@@ -517,8 +479,6 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f32(ptr addrspace(1) %out)
 ; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, -1.0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %canonicalized = call float @llvm.canonicalize.f32(float -1.0)
   store float %canonicalized, ptr addrspace(1) %out
@@ -551,8 +511,6 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f32(ptr addrspace(1) %
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x41800000
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: test_fold_canonicalize_literal_f32:
@@ -561,8 +519,6 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f32(ptr addrspace(1) %
 ; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x41800000
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %canonicalized = call float @llvm.canonicalize.f32(float 16.0)
   store float %canonicalized, ptr addrspace(1) %out
@@ -594,8 +550,6 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32(ptr
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v0, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32:
@@ -604,8 +558,6 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32(ptr
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v0, v0, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 8388607 to float))
   store float %canonicalized, ptr addrspace(1) %out
@@ -641,8 +593,6 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn
 ; GFX11-NEXT:    v_max_f32_e64 v1, 0x7fffff, 0x7fffff
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic:
@@ -652,8 +602,6 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn
 ; GFX12-NEXT:    v_max_num_f32_e64 v1, 0x7fffff, 0x7fffff
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 8388607 to float))
   store float %canonicalized, ptr addrspace(1) %out
@@ -689,8 +637,6 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn
 ; GFX11-NEXT:    v_max_f32_e64 v1, 0x7fffff, 0x7fffff
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_out:
@@ -700,8 +646,6 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn
 ; GFX12-NEXT:    v_max_num_f32_e64 v1, 0x7fffff, 0x7fffff
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 8388607 to float))
   store float %canonicalized, ptr addrspace(1) %out
@@ -737,8 +681,6 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn
 ; GFX11-NEXT:    v_max_f32_e64 v1, 0x7fffff, 0x7fffff
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_in:
@@ -748,8 +690,6 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn
 ; GFX12-NEXT:    v_max_num_f32_e64 v1, 0x7fffff, 0x7fffff
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 8388607 to float))
   store float %canonicalized, ptr addrspace(1) %out
@@ -782,8 +722,6 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f32(ptr ad
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fffff
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: test_denormals_fold_canonicalize_denormal0_f32:
@@ -792,8 +730,6 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f32(ptr ad
 ; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fffff
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 8388607 to float))
   store float %canonicalized, ptr addrspace(1) %out
@@ -827,8 +763,6 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f32(ptr
 ; GFX11-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: test_no_denormals_fold_canonicalize_denormal1_f32:
@@ -838,8 +772,6 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f32(ptr
 ; GFX12-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2155872255 to float))
   store float %canonicalized, ptr addrspace(1) %out
@@ -872,8 +804,6 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f32(ptr ad
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x807fffff
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: test_denormals_fold_canonicalize_denormal1_f32:
@@ -882,8 +812,6 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f32(ptr ad
 ; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x807fffff
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2155872255 to float))
   store float %canonicalized, ptr addrspace(1) %out
@@ -916,8 +844,6 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f32(ptr addrspace(1) %out
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: test_fold_canonicalize_qnan_f32:
@@ -926,8 +852,6 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f32(ptr addrspace(1) %out
 ; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %canonicalized = call float @llvm.canonicalize.f32(float 0x7FF8000000000000)
   store float %canonicalized, ptr addrspace(1) %out
@@ -960,8 +884,6 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f32(ptr addrsp
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: test_fold_canonicalize_qnan_value_neg1_f32:
@@ -970,8 +892,6 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f32(ptr addrsp
 ; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 -1 to float))
   store float %canonicalized, ptr addrspace(1) %out
@@ -1004,8 +924,6 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f32(ptr addrsp
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: test_fold_canonicalize_qnan_value_neg2_f32:
@@ -1014,8 +932,6 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f32(ptr addrsp
 ; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 -2 to float))
   store float %canonicalized, ptr addrspace(1) %out
@@ -1048,8 +964,6 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f32(ptr addrspace(
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: test_fold_canonicalize_snan0_value_f32:
@@ -1058,8 +972,6 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f32(ptr addrspace(
 ; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2139095041 to float))
   store float %canonicalized, ptr addrspace(1) %out
@@ -1092,8 +1004,6 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f32(ptr addrspace(
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: test_fold_canonicalize_snan1_value_f32:
@@ -1102,8 +1012,6 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f32(ptr addrspace(
 ; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2143289343 to float))
   store float %canonicalized, ptr addrspace(1) %out
@@ -1136,8 +1044,6 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f32(ptr addrspace(
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: test_fold_canonicalize_snan2_value_f32:
@@ -1146,8 +1052,6 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f32(ptr addrspace(
 ; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 4286578689 to float))
   store float %canonicalized, ptr addrspace(1) %out
@@ -1180,8 +1084,6 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f32(ptr addrspace(
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: test_fold_canonicalize_snan3_value_f32:
@@ -1190,8 +1092,6 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f32(ptr addrspace(
 ; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 4290772991 to float))
   store float %canonicalized, ptr addrspace(1) %out
@@ -1231,8 +1131,6 @@ define amdgpu_kernel void @v_test_canonicalize_var_f64(ptr addrspace(1) %out) #1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_test_canonicalize_var_f64:
@@ -1244,8 +1142,6 @@ define amdgpu_kernel void @v_test_canonicalize_var_f64(ptr addrspace(1) %out) #1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %val = load double, ptr addrspace(1) %out
   %canonicalized = call double @llvm.canonicalize.f64(double %val)
@@ -1290,8 +1186,6 @@ define amdgpu_kernel void @s_test_canonicalize_var_f64(ptr addrspace(1) %out, do
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_max_f64 v[0:1], s[2:3], s[2:3]
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: s_test_canonicalize_var_f64:
@@ -1301,8 +1195,6 @@ define amdgpu_kernel void @s_test_canonicalize_var_f64(ptr addrspace(1) %out, do
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_max_num_f64_e64 v[0:1], s[2:3], s[2:3]
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %canonicalized = call double @llvm.canonicalize.f64(double %val)
   store double %canonicalized, ptr addrspace(1) %out
@@ -1342,8 +1234,6 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f64(ptr addrspace(1) %ou
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_max_f64 v[0:1], |v[0:1]|, |v[0:1]|
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_test_canonicalize_fabs_var_f64:
@@ -1355,8 +1245,6 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f64(ptr addrspace(1) %ou
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_max_num_f64_e64 v[0:1], |v[0:1]|, |v[0:1]|
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %val = load double, ptr addrspace(1) %out
   %val.fabs = call double @llvm.fabs.f64(double %val)
@@ -1398,8 +1286,6 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f64(ptr addrspace(1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_max_f64 v[0:1], -|v[0:1]|, -|v[0:1]|
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_test_canonicalize_fneg_fabs_var_f64:
@@ -1411,8 +1297,6 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f64(ptr addrspace(1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_max_num_f64_e64 v[0:1], -|v[0:1]|, -|v[0:1]|
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %val = load double, ptr addrspace(1) %out
   %val.fabs = call double @llvm.fabs.f64(double %val)
@@ -1455,8 +1339,6 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f64(ptr addrspace(1) %ou
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_max_f64 v[0:1], -v[0:1], -v[0:1]
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_test_canonicalize_fneg_var_f64:
@@ -1468,8 +1350,6 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f64(ptr addrspace(1) %ou
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_max_num_f64_e64 v[0:1], -v[0:1], -v[0:1]
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %val = load double, ptr addrspace(1) %out
   %val.fneg = fneg double %val
@@ -1507,8 +1387,6 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f64(ptr addrspace(1) %out)
 ; GFX11-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: test_fold_canonicalize_p0_f64:
@@ -1519,8 +1397,6 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f64(ptr addrspace(1) %out)
 ; GFX12-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %canonicalized = call double @llvm.canonicalize.f64(double 0.0)
   store double %canonicalized, ptr addrspace(1) %out
@@ -1555,8 +1431,6 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f64(ptr addrspace(1) %out)
 ; GFX11-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: test_fold_canonicalize_n0_f64:
@@ -1566,8 +1440,6 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f64(ptr addrspace(1) %out)
 ; GFX12-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %canonicalized = call double @llvm.canonicalize.f64(double -0.0)
   store double %canonicalized, ptr addrspace(1) %out
@@ -1601,8 +1473,6 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f64(ptr addrspace(1) %out)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff00000
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: test_fold_canonicalize_p1_f64:
@@ -1611,8 +1481,6 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f64(ptr addrspace(1) %out)
 ; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff00000
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %canonicalized = call double @llvm.canonicalize.f64(double 1.0)
   store double %canonicalized, ptr addrspace(1) %out
@@ -1646,8 +1514,6 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f64(ptr addrspace(1) %out)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xbff00000
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: test_fold_canonicalize_n1_f64:
@@ -1656,8 +1522,6 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f64(ptr addrspace(1) %out)
 ; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xbff00000
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %canonicalized = call double @llvm.canonicalize.f64(double -1.0)
   store double %canonicalized, ptr addrspace(1) %out
@@ -1691,8 +1555,6 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f64(ptr addrspace(1) %
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x40300000
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: test_fold_canonicalize_literal_f64:
@@ -1701,8 +1563,6 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f64(ptr addrspace(1) %
 ; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x40300000
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %canonicalized = call double @llvm.canonicalize.f64(double 16.0)
   store double %canonicalized, ptr addrspace(1) %out
@@ -1738,8 +1598,6 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f64(ptr
 ; GFX11-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: test_no_denormals_fold_canonicalize_denormal0_f64:
@@ -1750,8 +1608,6 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f64(ptr
 ; GFX12-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 4503599627370495 to double))
   store double %canonicalized, ptr addrspace(1) %out
@@ -1787,8 +1643,6 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f64(ptr ad
 ; GFX11-NEXT:    v_mov_b32_e32 v0, -1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: test_denormals_fold_canonicalize_denormal0_f64:
@@ -1798,8 +1652,6 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f64(ptr ad
 ; GFX12-NEXT:    v_mov_b32_e32 v0, -1
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 4503599627370495 to double))
   store double %canonicalized, ptr addrspace(1) %out
@@ -1834,8 +1686,6 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f64(ptr
 ; GFX11-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: test_no_denormals_fold_canonicalize_denormal1_f64:
@@ -1845,8 +1695,6 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f64(ptr
 ; GFX12-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9227875636482146303 to double))
   store double %canonicalized, ptr addrspace(1) %out
@@ -1882,8 +1730,6 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f64(ptr ad
 ; GFX11-NEXT:    v_mov_b32_e32 v0, -1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: test_denormals_fold_canonicalize_denormal1_f64:
@@ -1893,8 +1739,6 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f64(ptr ad
 ; GFX12-NEXT:    v_mov_b32_e32 v0, -1
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9227875636482146303 to double))
   store double %canonicalized, ptr addrspace(1) %out
@@ -1928,8 +1772,6 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f64(ptr addrspace(1) %out
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: test_fold_canonicalize_qnan_f64:
@@ -1938,8 +1780,6 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f64(ptr addrspace(1) %out
 ; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %canonicalized = call double @llvm.canonicalize.f64(double 0x7FF8000000000000)
   store double %canonicalized, ptr addrspace(1) %out
@@ -1973,8 +1813,6 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f64(ptr addrsp
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: test_fold_canonicalize_qnan_value_neg1_f64:
@@ -1983,8 +1821,6 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f64(ptr addrsp
 ; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 -1 to double))
   store double %canonicalized, ptr addrspace(1) %out
@@ -2018,8 +1854,6 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f64(ptr addrsp
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: test_fold_canonicalize_qnan_value_neg2_f64:
@@ -2028,8 +1862,6 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f64(ptr addrsp
 ; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 -2 to double))
   store double %canonicalized, ptr addrspace(1) %out
@@ -2063,8 +1895,6 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f64(ptr addrspace(
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: test_fold_canonicalize_snan0_value_f64:
@@ -2073,8 +1903,6 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f64(ptr addrspace(
 ; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9218868437227405313 to double))
   store double %canonicalized, ptr addrspace(1) %out
@@ -2108,8 +1936,6 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f64(ptr addrspace(
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: test_fold_canonicalize_snan1_value_f64:
@@ -2118,8 +1944,6 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f64(ptr addrspace(
 ; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9223372036854775807 to double))
   store double %canonicalized, ptr addrspace(1) %out
@@ -2153,8 +1977,6 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f64(ptr addrspace(
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: test_fold_canonicalize_snan2_value_f64:
@@ -2163,8 +1985,6 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f64(ptr addrspace(
 ; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 18442240474082181121 to double))
   store double %canonicalized, ptr addrspace(1) %out
@@ -2198,8 +2018,6 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f64(ptr addrspace(
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: test_fold_canonicalize_snan3_value_f64:
@@ -2208,8 +2026,6 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f64(ptr addrspace(
 ; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 18446744073709551615 to double))
   store double %canonicalized, ptr addrspace(1) %out
@@ -2273,8 +2089,6 @@ define amdgpu_kernel void @test_canonicalize_value_f64_flush(ptr addrspace(1) %a
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: test_canonicalize_value_f64_flush:
@@ -2288,8 +2102,6 @@ define amdgpu_kernel void @test_canonicalize_value_f64_flush(ptr addrspace(1) %a
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds double, ptr addrspace(1) %arg, i32 %id
@@ -2357,8 +2169,6 @@ define amdgpu_kernel void @test_canonicalize_value_f32_flush(ptr addrspace(1) %a
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: test_canonicalize_value_f32_flush:
@@ -2372,8 +2182,6 @@ define amdgpu_kernel void @test_canonicalize_value_f32_flush(ptr addrspace(1) %a
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_max_num_f32_e32 v1, v1, v1
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
@@ -2442,8 +2250,6 @@ define amdgpu_kernel void @test_canonicalize_value_f16_flush(ptr addrspace(1) %a
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_max_f16_e32 v1, v1, v1
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: test_canonicalize_value_f16_flush:
@@ -2457,8 +2263,6 @@ define amdgpu_kernel void @test_canonicalize_value_f16_flush(ptr addrspace(1) %a
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_max_num_f16_e32 v1, v1, v1
 ; GFX12-NEXT:    global_store_b16 v0, v1, s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds half, ptr addrspace(1) %arg, i32 %id
@@ -2536,8 +2340,6 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_flush(ptr addrspace(1)
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: test_canonicalize_value_v2f16_flush:
@@ -2551,8 +2353,6 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_flush(ptr addrspace(1)
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_pk_max_num_f16 v1, v1, v1
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %arg, i32 %id
@@ -2620,8 +2420,6 @@ define amdgpu_kernel void @test_canonicalize_value_f64_denorm(ptr addrspace(1) %
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: test_canonicalize_value_f64_denorm:
@@ -2635,8 +2433,6 @@ define amdgpu_kernel void @test_canonicalize_value_f64_denorm(ptr addrspace(1) %
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds double, ptr addrspace(1) %arg, i32 %id
@@ -2704,8 +2500,6 @@ define amdgpu_kernel void @test_canonicalize_value_f32_denorm(ptr addrspace(1) %
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: test_canonicalize_value_f32_denorm:
@@ -2719,8 +2513,6 @@ define amdgpu_kernel void @test_canonicalize_value_f32_denorm(ptr addrspace(1) %
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_max_num_f32_e32 v1, v1, v1
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
@@ -2790,8 +2582,6 @@ define amdgpu_kernel void @test_canonicalize_value_f16_denorm(ptr addrspace(1) %
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_max_f16_e32 v1, v1, v1
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: test_canonicalize_value_f16_denorm:
@@ -2805,8 +2595,6 @@ define amdgpu_kernel void @test_canonicalize_value_f16_denorm(ptr addrspace(1) %
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_max_num_f16_e32 v1, v1, v1
 ; GFX12-NEXT:    global_store_b16 v0, v1, s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds half, ptr addrspace(1) %arg, i32 %id
@@ -2884,8 +2672,6 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_denorm(ptr addrspace(1)
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: test_canonicalize_value_v2f16_denorm:
@@ -2899,8 +2685,6 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_denorm(ptr addrspace(1)
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_pk_max_num_f16 v1, v1, v1
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %arg, i32 %id
@@ -2972,8 +2756,6 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f64(ptr addrspace(1) %out)
 ; GFX11-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX11-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
 ; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_test_canonicalize_var_v2f64:
@@ -2989,8 +2771,6 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f64(ptr addrspace(1) %out)
 ; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
 ; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
 ; GFX12-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr <2 x double>, ptr addrspace(1) %out, i32 %tid
diff --git a/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll b/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll
index 845b25a8f61bd7..59e52a86a2f5e8 100644
--- a/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll
@@ -77,8 +77,6 @@ define amdgpu_kernel void @fcmp_f16_lt(
 ; GFX11-NEXT:    v_cmp_lt_f16_e32 vcc_lo, v0, v1
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
@@ -167,8 +165,6 @@ define amdgpu_kernel void @fcmp_f16_lt_abs(
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s0
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
@@ -258,8 +254,6 @@ define amdgpu_kernel void @fcmp_f16_eq(
 ; GFX11-NEXT:    v_cmp_eq_f16_e32 vcc_lo, v0, v1
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
@@ -347,8 +341,6 @@ define amdgpu_kernel void @fcmp_f16_le(
 ; GFX11-NEXT:    v_cmp_le_f16_e32 vcc_lo, v0, v1
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
@@ -436,8 +428,6 @@ define amdgpu_kernel void @fcmp_f16_gt(
 ; GFX11-NEXT:    v_cmp_gt_f16_e32 vcc_lo, v0, v1
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
@@ -525,8 +515,6 @@ define amdgpu_kernel void @fcmp_f16_lg(
 ; GFX11-NEXT:    v_cmp_lg_f16_e32 vcc_lo, v0, v1
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
@@ -614,8 +602,6 @@ define amdgpu_kernel void @fcmp_f16_ge(
 ; GFX11-NEXT:    v_cmp_ge_f16_e32 vcc_lo, v0, v1
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
@@ -703,8 +689,6 @@ define amdgpu_kernel void @fcmp_f16_o(
 ; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v1
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
@@ -792,8 +776,6 @@ define amdgpu_kernel void @fcmp_f16_u(
 ; GFX11-NEXT:    v_cmp_u_f16_e32 vcc_lo, v0, v1
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
@@ -881,8 +863,6 @@ define amdgpu_kernel void @fcmp_f16_nge(
 ; GFX11-NEXT:    v_cmp_nge_f16_e32 vcc_lo, v0, v1
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
@@ -970,8 +950,6 @@ define amdgpu_kernel void @fcmp_f16_nlg(
 ; GFX11-NEXT:    v_cmp_nlg_f16_e32 vcc_lo, v0, v1
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
@@ -1059,8 +1037,6 @@ define amdgpu_kernel void @fcmp_f16_ngt(
 ; GFX11-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v0, v1
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
@@ -1148,8 +1124,6 @@ define amdgpu_kernel void @fcmp_f16_nle(
 ; GFX11-NEXT:    v_cmp_nle_f16_e32 vcc_lo, v0, v1
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
@@ -1237,8 +1211,6 @@ define amdgpu_kernel void @fcmp_f16_neq(
 ; GFX11-NEXT:    v_cmp_neq_f16_e32 vcc_lo, v0, v1
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
@@ -1326,8 +1298,6 @@ define amdgpu_kernel void @fcmp_f16_nlt(
 ; GFX11-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v0, v1
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
@@ -1430,8 +1400,6 @@ define amdgpu_kernel void @fcmp_v2f16_lt(
 ; GFX11-NEXT:    v_cmp_lt_f16_e32 vcc_lo, v3, v2
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
 ; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
@@ -1535,8 +1503,6 @@ define amdgpu_kernel void @fcmp_v2f16_eq(
 ; GFX11-NEXT:    v_cmp_eq_f16_e32 vcc_lo, v3, v2
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
 ; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
@@ -1639,8 +1605,6 @@ define amdgpu_kernel void @fcmp_v2f16_le(
 ; GFX11-NEXT:    v_cmp_le_f16_e32 vcc_lo, v3, v2
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
 ; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
@@ -1743,8 +1707,6 @@ define amdgpu_kernel void @fcmp_v2f16_gt(
 ; GFX11-NEXT:    v_cmp_gt_f16_e32 vcc_lo, v3, v2
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
 ; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
@@ -1848,8 +1810,6 @@ define amdgpu_kernel void @fcmp_v2f16_lg(
 ; GFX11-NEXT:    v_cmp_lg_f16_e32 vcc_lo, v3, v2
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
 ; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
@@ -1953,8 +1913,6 @@ define amdgpu_kernel void @fcmp_v2f16_ge(
 ; GFX11-NEXT:    v_cmp_ge_f16_e32 vcc_lo, v3, v2
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
 ; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
@@ -2058,8 +2016,6 @@ define amdgpu_kernel void @fcmp_v2f16_o(
 ; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v3, v2
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
 ; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
@@ -2163,8 +2119,6 @@ define amdgpu_kernel void @fcmp_v2f16_u(
 ; GFX11-NEXT:    v_cmp_u_f16_e32 vcc_lo, v3, v2
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
 ; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
@@ -2267,8 +2221,6 @@ define amdgpu_kernel void @fcmp_v2f16_nge(
 ; GFX11-NEXT:    v_cmp_nge_f16_e32 vcc_lo, v3, v2
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
 ; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
@@ -2371,8 +2323,6 @@ define amdgpu_kernel void @fcmp_v2f16_nlg(
 ; GFX11-NEXT:    v_cmp_nlg_f16_e32 vcc_lo, v3, v2
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
 ; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
@@ -2476,8 +2426,6 @@ define amdgpu_kernel void @fcmp_v2f16_ngt(
 ; GFX11-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v3, v2
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
 ; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
@@ -2580,8 +2528,6 @@ define amdgpu_kernel void @fcmp_v2f16_nle(
 ; GFX11-NEXT:    v_cmp_nle_f16_e32 vcc_lo, v3, v2
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
 ; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
@@ -2684,8 +2630,6 @@ define amdgpu_kernel void @fcmp_v2f16_neq(
 ; GFX11-NEXT:    v_cmp_neq_f16_e32 vcc_lo, v3, v2
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
 ; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
@@ -2788,8 +2732,6 @@ define amdgpu_kernel void @fcmp_v2f16_nlt(
 ; GFX11-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v3, v2
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
 ; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
index e6f9889440f0cd..13367d3bb36e29 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
@@ -71,8 +71,6 @@ define amdgpu_kernel void @s_copysign_f16(ptr addrspace(1) %arg_out, half %mag,
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_bfi_b32 v0, 0x7fff, s4, v0
 ; GFX11-NEXT:    global_store_b16 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %out = call half @llvm.copysign.f16(half %mag, half %sign)
   store half %out, ptr addrspace(1) %arg_out
@@ -125,8 +123,6 @@ define amdgpu_kernel void @s_test_copysign_f16_0(ptr addrspace(1) %out, half %ma
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %result = call half @llvm.copysign.f16(half %mag, half 0.0)
   store half %result, ptr addrspace(1) %out, align 4
@@ -179,8 +175,6 @@ define amdgpu_kernel void @s_test_copysign_f16_1(ptr addrspace(1) %out, half %ma
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %result = call half @llvm.copysign.f16(half %mag, half 1.0)
   store half %result, ptr addrspace(1) %out, align 4
@@ -233,8 +227,6 @@ define amdgpu_kernel void @s_test_copysign_f16_10.0(ptr addrspace(1) %out, half
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %result = call half @llvm.copysign.f16(half %mag, half 10.0)
   store half %result, ptr addrspace(1) %out, align 4
@@ -287,8 +279,6 @@ define amdgpu_kernel void @s_test_copysign_f16_neg1(ptr addrspace(1) %out, half
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %result = call half @llvm.copysign.f16(half %mag, half -1.0)
   store half %result, ptr addrspace(1) %out, align 4
@@ -341,8 +331,6 @@ define amdgpu_kernel void @s_test_copysign_f16_neg10(ptr addrspace(1) %out, half
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %result = call half @llvm.copysign.f16(half %mag, half -10.0)
   store half %result, ptr addrspace(1) %out, align 4
@@ -398,8 +386,6 @@ define amdgpu_kernel void @s_test_copysign_f16_0_mag(ptr addrspace(1) %out, half
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %result = call half @llvm.copysign.f16(half 0.0, half %sign)
   store half %result, ptr addrspace(1) %out, align 4
@@ -459,8 +445,6 @@ define amdgpu_kernel void @s_test_copysign_f16_1_mag(ptr addrspace(1) %out, half
 ; GFX11-NEXT:    s_or_b32 s2, s2, 0x3c00
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %result = call half @llvm.copysign.f16(half 1.0, half %sign)
   store half %result, ptr addrspace(1) %out, align 4
@@ -520,8 +504,6 @@ define amdgpu_kernel void @s_test_copysign_f16_10_mag(ptr addrspace(1) %out, hal
 ; GFX11-NEXT:    s_or_b32 s2, s2, 0x4900
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %result = call half @llvm.copysign.f16(half 10.0, half %sign)
   store half %result, ptr addrspace(1) %out, align 4
@@ -580,8 +562,6 @@ define amdgpu_kernel void @s_test_copysign_f16_neg1_mag(ptr addrspace(1) %out, h
 ; GFX11-NEXT:    s_or_b32 s2, s2, 0x3c00
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %result = call half @llvm.copysign.f16(half -1.0, half %sign)
   store half %result, ptr addrspace(1) %out, align 4
@@ -641,8 +621,6 @@ define amdgpu_kernel void @s_test_copysign_f16_neg10_mag(ptr addrspace(1) %out,
 ; GFX11-NEXT:    s_or_b32 s2, s2, 0x4900
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %result = call half @llvm.copysign.f16(half -10.0, half %sign)
   store half %result, ptr addrspace(1) %out, align 4
@@ -915,8 +893,6 @@ define amdgpu_kernel void @v_copysign_out_f32_mag_f16_sign_f32(ptr addrspace(1)
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_bfi_b32 v0, 0x7fffffff, v1, v0
 ; GFX11-NEXT:    global_store_b32 v2, v0, s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %arg_mag_gep = getelementptr half, ptr addrspace(1) %arg_mag, i32 %tid
@@ -1019,8 +995,6 @@ define amdgpu_kernel void @v_copysign_out_f64_mag_f16_sign_f64(ptr addrspace(1)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    v_bfi_b32 v3, 0x7fffffff, v3, v1
 ; GFX11-NEXT:    global_store_b64 v0, v[2:3], s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %arg_mag_gep = getelementptr half, ptr addrspace(1) %arg_mag, i32 %tid
@@ -1120,8 +1094,6 @@ define amdgpu_kernel void @v_copysign_out_f32_mag_f32_sign_f16(ptr addrspace(1)
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_bfi_b32 v0, 0x7fffffff, v0, v1
 ; GFX11-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %arg_mag_gep = getelementptr float, ptr addrspace(1) %arg_mag, i32 %tid
@@ -1223,8 +1195,6 @@ define amdgpu_kernel void @v_copysign_out_f64_mag_f64_sign_f16(ptr addrspace(1)
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_bfi_b32 v1, 0x7fffffff, v1, v2
 ; GFX11-NEXT:    global_store_b64 v3, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %arg_mag_gep = getelementptr double, ptr addrspace(1) %arg_mag, i32 %tid
@@ -1326,8 +1296,6 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f32(ptr addrspace(1)
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_bfi_b32 v0, 0x7fff, v0, v1
 ; GFX11-NEXT:    global_store_b16 v2, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %arg_mag_gep = getelementptr half, ptr addrspace(1) %arg_mag, i32 %tid
@@ -1426,8 +1394,6 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f64(ptr addrspace(1)
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_bfi_b32 v0, 0x7fff, v0, v1
 ; GFX11-NEXT:    global_store_b16 v2, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %arg_mag_gep = getelementptr half, ptr addrspace(1) %arg_mag, i32 %tid
@@ -1531,8 +1497,6 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f32_sign_f16(ptr addrspace(1)
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_bfi_b32 v0, 0x7fff, v1, v0
 ; GFX11-NEXT:    global_store_b16 v2, v0, s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %arg_mag_gep = getelementptr float, ptr addrspace(1) %arg_mag, i32 %tid
@@ -1787,8 +1751,6 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, s3, v0, vcc_lo
 ; GFX11-NEXT:    v_bfi_b32 v0, 0x7fff, v0, s0
 ; GFX11-NEXT:    global_store_b16 v1, v0, s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %mag.trunc = fptrunc double %mag to half
   %result = call half @llvm.copysign.f16(half %mag.trunc, half %sign)
@@ -1877,8 +1839,6 @@ define amdgpu_kernel void @s_copysign_v2f16(ptr addrspace(1) %arg_out, <2 x half
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX11-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX11-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %out = call <2 x half> @llvm.copysign.v2f16(<2 x half> %arg_mag, <2 x half> %arg_sign)
   store <2 x half> %out, ptr addrspace(1) %arg_out
@@ -1989,8 +1949,6 @@ define amdgpu_kernel void @s_copysign_v3f16(ptr addrspace(1) %arg_out, <3 x half
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b16 v3, v2, s[0:1] offset:4
 ; GFX11-NEXT:    global_store_b32 v3, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %out = call <3 x half> @llvm.copysign.v3f16(<3 x half> %arg_mag, <3 x half> %arg_sign)
   store <3 x half> %out, ptr addrspace(1) %arg_out
@@ -2118,8 +2076,6 @@ define amdgpu_kernel void @s_copysign_v4f16(ptr addrspace(1) %arg_out, <4 x half
 ; GFX11-NEXT:    v_lshl_or_b32 v1, v2, 16, v0
 ; GFX11-NEXT:    v_lshl_or_b32 v0, v3, 16, v4
 ; GFX11-NEXT:    global_store_b64 v5, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %out = call <4 x half> @llvm.copysign.v4f16(<4 x half> %arg_mag, <4 x half> %arg_sign)
   store <4 x half> %out, ptr addrspace(1) %arg_out
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll
index 542d67486e7580..43cf26c422a7cf 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll
@@ -40,8 +40,6 @@ define amdgpu_kernel void @s_test_copysign_f32(ptr addrspace(1) %out, float %mag
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_bfi_b32 v0, 0x7fffffff, s2, v0
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %result = call float @llvm.copysign.f32(float %mag, float %sign)
   store float %result, ptr addrspace(1) %out, align 4
@@ -83,8 +81,6 @@ define amdgpu_kernel void @s_test_copysign_f32_0(ptr addrspace(1) %out, float %m
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %result = call float @llvm.copysign.f32(float %mag, float 0.0)
   store float %result, ptr addrspace(1) %out, align 4
@@ -126,8 +122,6 @@ define amdgpu_kernel void @s_test_copysign_f32_1(ptr addrspace(1) %out, float %m
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %result = call float @llvm.copysign.f32(float %mag, float 1.0)
   store float %result, ptr addrspace(1) %out, align 4
@@ -169,8 +163,6 @@ define amdgpu_kernel void @s_test_copysign_f32_10.0(ptr addrspace(1) %out, float
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %result = call float @llvm.copysign.f32(float %mag, float 10.0)
   store float %result, ptr addrspace(1) %out, align 4
@@ -212,8 +204,6 @@ define amdgpu_kernel void @s_test_copysign_f32_neg1(ptr addrspace(1) %out, float
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %result = call float @llvm.copysign.f32(float %mag, float -1.0)
   store float %result, ptr addrspace(1) %out, align 4
@@ -255,8 +245,6 @@ define amdgpu_kernel void @s_test_copysign_f32_neg10(ptr addrspace(1) %out, floa
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %result = call float @llvm.copysign.f32(float %mag, float -10.0)
   store float %result, ptr addrspace(1) %out, align 4
@@ -298,8 +286,6 @@ define amdgpu_kernel void @s_test_copysign_f32_0_mag(ptr addrspace(1) %out, floa
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %result = call float @llvm.copysign.f32(float 0.0, float %sign)
   store float %result, ptr addrspace(1) %out, align 4
@@ -345,8 +331,6 @@ define amdgpu_kernel void @s_test_copysign_f32_1_mag(ptr addrspace(1) %out, floa
 ; GFX11-NEXT:    s_or_b32 s2, s2, 1.0
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %result = call float @llvm.copysign.f32(float 1.0, float %sign)
   store float %result, ptr addrspace(1) %out, align 4
@@ -391,8 +375,6 @@ define amdgpu_kernel void @s_test_copysign_f32_10_mag(ptr addrspace(1) %out, flo
 ; GFX11-NEXT:    s_or_b32 s2, s2, 0x41200000
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %result = call float @llvm.copysign.f32(float 10.0, float %sign)
   store float %result, ptr addrspace(1) %out, align 4
@@ -437,8 +419,6 @@ define amdgpu_kernel void @s_test_copysign_f32_neg1_mag(ptr addrspace(1) %out, f
 ; GFX11-NEXT:    s_or_b32 s2, s2, 1.0
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %result = call float @llvm.copysign.f32(float -1.0, float %sign)
   store float %result, ptr addrspace(1) %out, align 4
@@ -483,8 +463,6 @@ define amdgpu_kernel void @s_test_copysign_f32_neg10_mag(ptr addrspace(1) %out,
 ; GFX11-NEXT:    s_or_b32 s2, s2, 0x41200000
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %result = call float @llvm.copysign.f32(float -10.0, float %sign)
   store float %result, ptr addrspace(1) %out, align 4
@@ -538,8 +516,6 @@ define amdgpu_kernel void @s_test_copysign_v2f32(ptr addrspace(1) %out, <2 x flo
 ; GFX11-NEXT:    v_bfi_b32 v1, 0x7fffffff, s5, v0
 ; GFX11-NEXT:    v_bfi_b32 v0, 0x7fffffff, s4, v2
 ; GFX11-NEXT:    global_store_b64 v3, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %result = call <2 x float> @llvm.copysign.v2f32(<2 x float> %mag, <2 x float> %sign)
   store <2 x float> %result, ptr addrspace(1) %out, align 8
@@ -603,8 +579,6 @@ define amdgpu_kernel void @s_test_copysign_v3f32(ptr addrspace(1) %out, <3 x flo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-NEXT:    v_bfi_b32 v0, 0x7fffffff, s4, v3
 ; GFX11-NEXT:    global_store_b96 v4, v[0:2], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %result = call <3 x float> @llvm.copysign.v3f32(<3 x float> %mag, <3 x float> %sign)
   store <3 x float> %result, ptr addrspace(1) %out, align 16
@@ -674,8 +648,6 @@ define amdgpu_kernel void @s_test_copysign_v4f32(ptr addrspace(1) %out, <4 x flo
 ; GFX11-NEXT:    v_bfi_b32 v1, 0x7fffffff, s5, v4
 ; GFX11-NEXT:    v_bfi_b32 v0, 0x7fffffff, s4, v5
 ; GFX11-NEXT:    global_store_b128 v6, v[0:3], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %result = call <4 x float> @llvm.copysign.v4f32(<4 x float> %mag, <4 x float> %sign)
   store <4 x float> %result, ptr addrspace(1) %out, align 16
@@ -946,8 +918,6 @@ define amdgpu_kernel void @s_test_copysign_f32_fptrunc_f64(ptr addrspace(1) %out
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_bfi_b32 v0, 0x7fffffff, s0, v0
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %sign.trunc = fptrunc double %sign to float
   %result = call float @llvm.copysign.f32(float %mag, float %sign.trunc)
@@ -991,8 +961,6 @@ define amdgpu_kernel void @s_test_copysign_f32_1_fptrunc_f64(ptr addrspace(1) %o
 ; GFX11-NEXT:    s_or_b32 s2, s2, 1.0
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %sign.trunc = fptrunc double %sign to float
   %result = call float @llvm.copysign.f32(float 1.0, float %sign.trunc)
@@ -1038,8 +1006,6 @@ define amdgpu_kernel void @s_test_copysign_f32_fpext_f16(ptr addrspace(1) %out,
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_bfi_b32 v0, 0x7fffffff, s2, v0
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %sign.ext = fpext half %sign to float
   %result = call float @llvm.copysign.f32(float %mag, float %sign.ext)
@@ -1089,8 +1055,6 @@ define amdgpu_kernel void @s_test_copysign_f32_1_fpext_f16(ptr addrspace(1) %out
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %sign.ext = fpext half %sign to float
   %result = call float @llvm.copysign.f32(float 1.0, float %sign.ext)
@@ -1137,8 +1101,6 @@ define amdgpu_kernel void @s_test_copysign_f32_fpext_bf16(ptr addrspace(1) %out,
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_bfi_b32 v0, 0x7fffffff, s2, v0
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %sign.ext = fpext bfloat %sign to float
   %result = call float @llvm.copysign.f32(float %mag, float %sign.ext)
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll
index 4300faa02742a3..1bcc4132007100 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll
@@ -54,8 +54,6 @@ define amdgpu_kernel void @s_test_copysign_f64(ptr addrspace(1) %out, [8 x i32],
 ; GFX11-NEXT:    v_bfi_b32 v1, 0x7fffffff, s5, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %result = call double @llvm.copysign.f64(double %mag, double %sign)
   store double %result, ptr addrspace(1) %out, align 8
@@ -100,8 +98,6 @@ define amdgpu_kernel void @s_test_copysign_f64_0(ptr addrspace(1) %out, [8 x i32
 ; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %result = call double @llvm.copysign.f64(double %mag, double 0.0)
   store double %result, ptr addrspace(1) %out, align 8
@@ -146,8 +142,6 @@ define amdgpu_kernel void @s_test_copysign_f64_1(ptr addrspace(1) %out, [8 x i32
 ; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %result = call double @llvm.copysign.f64(double %mag, double 1.0)
   store double %result, ptr addrspace(1) %out, align 8
@@ -192,8 +186,6 @@ define amdgpu_kernel void @s_test_copysign_f64_10(ptr addrspace(1) %out, [8 x i3
 ; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %result = call double @llvm.copysign.f64(double %mag, double 10.0)
   store double %result, ptr addrspace(1) %out, align 8
@@ -238,8 +230,6 @@ define amdgpu_kernel void @s_test_copysign_f64_neg1(ptr addrspace(1) %out, [8 x
 ; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %result = call double @llvm.copysign.f64(double %mag, double -1.0)
   store double %result, ptr addrspace(1) %out, align 8
@@ -284,8 +274,6 @@ define amdgpu_kernel void @s_test_copysign_f64_neg10(ptr addrspace(1) %out, [8 x
 ; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %result = call double @llvm.copysign.f64(double %mag, double -10.0)
   store double %result, ptr addrspace(1) %out, align 8
@@ -338,8 +326,6 @@ define amdgpu_kernel void @s_test_copysign_f64_f32(ptr addrspace(1) %out, [8 x i
 ; GFX11-NEXT:    v_bfi_b32 v1, 0x7fffffff, s1, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %sign.ext = fpext float %sign to double
   %result = call double @llvm.copysign.f64(double %mag, double %sign.ext)
@@ -393,8 +379,6 @@ define amdgpu_kernel void @s_test_copysign_f64_f16(ptr addrspace(1) %out, [8 x i
 ; GFX11-NEXT:    v_bfi_b32 v1, 0x7fffffff, s1, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %sign.ext = fpext half %sign to double
   %result = call double @llvm.copysign.f64(double %mag, double %sign.ext)
@@ -437,8 +421,6 @@ define amdgpu_kernel void @s_test_copysign_f64_0_mag(ptr addrspace(1) %out, doub
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %result = call double @llvm.copysign.f64(double 0.0, double %sign)
   store double %result, ptr addrspace(1) %out, align 4
@@ -483,8 +465,6 @@ define amdgpu_kernel void @s_test_copysign_f64_1_mag(ptr addrspace(1) %out, doub
 ; GFX11-NEXT:    s_or_b32 s2, s2, 0x3ff00000
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %result = call double @llvm.copysign.f64(double 1.0, double %sign)
   store double %result, ptr addrspace(1) %out, align 4
@@ -529,8 +509,6 @@ define amdgpu_kernel void @s_test_copysign_f64_10_mag(ptr addrspace(1) %out, dou
 ; GFX11-NEXT:    s_or_b32 s2, s2, 0x40240000
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %result = call double @llvm.copysign.f64(double 10.0, double %sign)
   store double %result, ptr addrspace(1) %out, align 4
@@ -575,8 +553,6 @@ define amdgpu_kernel void @s_test_copysign_f64_neg1_mag(ptr addrspace(1) %out, d
 ; GFX11-NEXT:    s_or_b32 s2, s2, 0x3ff00000
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %result = call double @llvm.copysign.f64(double -1.0, double %sign)
   store double %result, ptr addrspace(1) %out, align 4
@@ -621,8 +597,6 @@ define amdgpu_kernel void @s_test_copysign_f64_neg10_mag(ptr addrspace(1) %out,
 ; GFX11-NEXT:    s_or_b32 s2, s2, 0x40240000
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %result = call double @llvm.copysign.f64(double -10.0, double %sign)
   store double %result, ptr addrspace(1) %out, align 4
@@ -682,8 +656,6 @@ define amdgpu_kernel void @s_test_copysign_v2f64(ptr addrspace(1) %out, <2 x dou
 ; GFX11-NEXT:    v_bfi_b32 v1, 0x7fffffff, s5, v2
 ; GFX11-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %result = call <2 x double> @llvm.copysign.v2f64(<2 x double> %mag, <2 x double> %sign)
   store <2 x double> %result, ptr addrspace(1) %out, align 16
@@ -760,8 +732,6 @@ define amdgpu_kernel void @s_test_copysign_v3f64(ptr addrspace(1) %out, <3 x dou
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b64 v6, v[4:5], s[0:1] offset:16
 ; GFX11-NEXT:    global_store_b128 v6, v[0:3], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %result = call <3 x double> @llvm.copysign.v3f64(<3 x double> %mag, <3 x double> %sign)
   store <3 x double> %result, ptr addrspace(1) %out, align 32
@@ -849,8 +819,6 @@ define amdgpu_kernel void @s_test_copysign_v4f64(ptr addrspace(1) %out, <4 x dou
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b128 v8, v[0:3], s[0:1] offset:16
 ; GFX11-NEXT:    global_store_b128 v8, v[4:7], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %result = call <4 x double> @llvm.copysign.v4f64(<4 x double> %mag, <4 x double> %sign)
   store <4 x double> %result, ptr addrspace(1) %out, align 32
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
index 0c6805e3eba598..d1676b13c12963 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
@@ -161,8 +161,6 @@ define amdgpu_kernel void @v_fdiv_f16(
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_div_fixup_f16 v1, v3, v2, v1
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
@@ -262,8 +260,6 @@ define amdgpu_kernel void @v_rcp_f16(ptr addrspace(1) %r, ptr addrspace(1) %b) #
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_rcp_f16_e32 v1, v1
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -358,8 +354,6 @@ define amdgpu_kernel void @v_rcp_f16_abs(ptr addrspace(1) %r, ptr addrspace(1) %
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_rcp_f16_e64 v1, |v1|
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -457,8 +451,6 @@ define amdgpu_kernel void @reciprocal_f16_rounded(ptr addrspace(1) %r, ptr addrs
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_rcp_f16_e32 v1, v1
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -540,8 +532,6 @@ define amdgpu_kernel void @v_rcp_f16_afn(ptr addrspace(1) %r, ptr addrspace(1) %
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_rcp_f16_e32 v1, v1
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -636,8 +626,6 @@ define amdgpu_kernel void @v_rcp_f16_neg(ptr addrspace(1) %r, ptr addrspace(1) %
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_rcp_f16_e64 v1, -v1
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -735,8 +723,6 @@ define amdgpu_kernel void @v_rsq_f16(ptr addrspace(1) %r, ptr addrspace(1) %b) #
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_rsq_f16_e32 v1, v1
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -840,8 +826,6 @@ define amdgpu_kernel void @v_rsq_f16_neg(ptr addrspace(1) %r, ptr addrspace(1) %
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -950,8 +934,6 @@ define amdgpu_kernel void @v_rsq_f16_multi_use(ptr addrspace(1) %r, ptr addrspac
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1] dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    global_store_b16 v0, v2, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1056,8 +1038,6 @@ define amdgpu_kernel void @v_rsq_f16_missing_contract0(ptr addrspace(1) %r, ptr
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_rcp_f16_e32 v1, v1
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1161,8 +1141,6 @@ define amdgpu_kernel void @v_rsq_f16_missing_contract1(ptr addrspace(1) %r, ptr
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_rcp_f16_e32 v1, v1
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1266,8 +1244,6 @@ define amdgpu_kernel void @v_neg_rsq_f16_missing_contract1(ptr addrspace(1) %r,
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_rcp_f16_e64 v1, -v1
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1378,8 +1354,6 @@ define amdgpu_kernel void @v_fdiv_f16_afn(ptr addrspace(1) %r, ptr addrspace(1)
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1491,8 +1465,6 @@ define amdgpu_kernel void @v_fdiv_f16_unsafe(ptr addrspace(1) %r, ptr addrspace(
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1565,8 +1537,6 @@ define amdgpu_kernel void @div_afn_2_x_pat_f16(ptr addrspace(1) %out) #0 {
 ; GFX11-NEXT:    v_mul_f16_e32 v0, 0.5, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b16 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %x = load half, ptr addrspace(1) undef
   %rcp = fdiv afn half %x, 2.0
@@ -1632,8 +1602,6 @@ define amdgpu_kernel void @div_afn_k_x_pat_f16(ptr addrspace(1) %out) #0 {
 ; GFX11-NEXT:    v_mul_f16_e32 v0, 0x2e66, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b16 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %x = load half, ptr addrspace(1) undef
   %rcp = fdiv afn half %x, 10.0
@@ -1699,8 +1667,6 @@ define amdgpu_kernel void @div_afn_neg_k_x_pat_f16(ptr addrspace(1) %out) #0 {
 ; GFX11-NEXT:    v_mul_f16_e32 v0, 0xae66, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b16 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %x = load half, ptr addrspace(1) undef
   %rcp = fdiv afn half %x, -10.0
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.ll b/llvm/test/CodeGen/AMDGPU/fdiv.ll
index 93105e57a59187..f34739c5ca25b2 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv.ll
@@ -157,8 +157,6 @@ define amdgpu_kernel void @s_fdiv_f32_ninf(ptr addrspace(1) %out, float %a, floa
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-NEXT:    v_div_fixup_f32 v0, v0, s3, s2
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_fdiv_f32_ninf:
@@ -310,8 +308,6 @@ define amdgpu_kernel void @s_fdiv_f32_ieee(ptr addrspace(1) %out, float %a, floa
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-NEXT:    v_div_fixup_f32 v0, v0, s3, s2
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_fdiv_f32_ieee:
@@ -395,8 +391,6 @@ define amdgpu_kernel void @s_fdiv_25ulp_f32(ptr addrspace(1) %out, float %a, flo
 ; GFX11-NEXT:    v_mul_f32_e32 v1, s2, v1
 ; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_fdiv_25ulp_f32:
@@ -511,8 +505,6 @@ define amdgpu_kernel void @s_fdiv_25ulp_ieee_f32(ptr addrspace(1) %out, float %a
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX11-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_fdiv_25ulp_ieee_f32:
@@ -575,8 +567,6 @@ define amdgpu_kernel void @s_fdiv_fast_ieee_f32(ptr addrspace(1) %out, float %a,
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, s2, v0
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_fdiv_fast_ieee_f32:
@@ -639,8 +629,6 @@ define amdgpu_kernel void @s_fdiv_f32_fast_math(ptr addrspace(1) %out, float %a,
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, s2, v0
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_fdiv_f32_fast_math:
@@ -703,8 +691,6 @@ define amdgpu_kernel void @s_fdiv_ulp25_f32_fast_math(ptr addrspace(1) %out, flo
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, s2, v0
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_fdiv_ulp25_f32_fast_math:
@@ -868,8 +854,6 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_daz(ptr addrspace(1) %out, float %a,
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-NEXT:    v_div_fixup_f32 v0, v0, s3, s2
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_fdiv_f32_arcp_daz:
@@ -932,8 +916,6 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_ninf(ptr addrspace(1) %out, float %a,
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, s2, v0
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_fdiv_f32_arcp_ninf:
@@ -1188,8 +1170,6 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
 ; GFX11-NEXT:    v_div_fmas_f32 v0, v0, v3, v4
 ; GFX11-NEXT:    v_div_fixup_f32 v0, v0, s6, s4
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_fdiv_v2f32:
@@ -1266,8 +1246,6 @@ define amdgpu_kernel void @s_fdiv_ulp25_v2f32(ptr addrspace(1) %out, <2 x float>
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_dual_mul_f32 v0, s4, v0 :: v_dual_mul_f32 v1, s5, v1
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_fdiv_ulp25_v2f32:
@@ -1344,8 +1322,6 @@ define amdgpu_kernel void @s_fdiv_v2f32_fast_math(ptr addrspace(1) %out, <2 x fl
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_dual_mul_f32 v1, s5, v0 :: v_dual_mul_f32 v0, s4, v2
 ; GFX11-NEXT:    global_store_b64 v3, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_fdiv_v2f32_fast_math:
@@ -1422,8 +1398,6 @@ define amdgpu_kernel void @s_fdiv_v2f32_arcp_math(ptr addrspace(1) %out, <2 x fl
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_dual_mul_f32 v1, s5, v0 :: v_dual_mul_f32 v0, s4, v2
 ; GFX11-NEXT:    global_store_b64 v3, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_fdiv_v2f32_arcp_math:
@@ -1856,8 +1830,6 @@ define amdgpu_kernel void @s_fdiv_v4f32(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX11-NEXT:    v_div_fmas_f32 v0, v0, v5, v6
 ; GFX11-NEXT:    v_div_fixup_f32 v0, v0, s4, s0
 ; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[8:9]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_fdiv_v4f32:
@@ -1964,8 +1936,6 @@ define amdgpu_kernel void @s_fdiv_v4f32_fast_math(ptr addrspace(1) %out, ptr add
 ; GFX11-NEXT:    v_dual_mul_f32 v2, s2, v1 :: v_dual_mul_f32 v1, s1, v4
 ; GFX11-NEXT:    v_mul_f32_e32 v0, s0, v5
 ; GFX11-NEXT:    global_store_b128 v6, v[0:3], s[8:9]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_fdiv_v4f32_fast_math:
@@ -2072,8 +2042,6 @@ define amdgpu_kernel void @s_fdiv_v4f32_arcp_math(ptr addrspace(1) %out, ptr add
 ; GFX11-NEXT:    v_dual_mul_f32 v2, s2, v1 :: v_dual_mul_f32 v1, s1, v4
 ; GFX11-NEXT:    v_mul_f32_e32 v0, s0, v5
 ; GFX11-NEXT:    global_store_b128 v6, v[0:3], s[8:9]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_fdiv_v4f32_arcp_math:
@@ -2248,8 +2216,6 @@ define amdgpu_kernel void @s_fdiv_f32_correctly_rounded_divide_sqrt(ptr addrspac
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-NEXT:    v_div_fixup_f32 v0, v0, s4, 1.0
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_fdiv_f32_correctly_rounded_divide_sqrt:
@@ -2396,8 +2362,6 @@ define amdgpu_kernel void @s_fdiv_f32_denorms_correctly_rounded_divide_sqrt(ptr
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-NEXT:    v_div_fixup_f32 v0, v0, s4, 1.0
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_fdiv_f32_denorms_correctly_rounded_divide_sqrt:
diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-nondeterminism.ll b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-nondeterminism.ll
index a78a8a2ed8fe66..67cfe52482542f 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-nondeterminism.ll
+++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-nondeterminism.ll
@@ -27,8 +27,6 @@ define amdgpu_gs void @f(i32 inreg %arg, i32 %arg1, i32 %arg2) {
 ; CHECK-NEXT:    s_clause 0x1
 ; CHECK-NEXT:    buffer_store_b128 v[5:8], v6, s[0:3], 0 idxen
 ; CHECK-NEXT:    buffer_store_b128 v[1:4], v6, s[0:3], 0 idxen
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; CHECK-NEXT:    s_endpgm
 bb:
   %i = icmp eq i32 %arg, 0
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
index 3e4b43d9cfcd34..de6da549e98215 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
@@ -4920,8 +4920,6 @@ define amdgpu_gs void @sgpr_base_large_offset(ptr addrspace(1) %out, ptr addrspa
 ; GFX11-NEXT:    scratch_load_b32 v2, off, s0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: sgpr_base_large_offset:
@@ -4929,8 +4927,6 @@ define amdgpu_gs void @sgpr_base_large_offset(ptr addrspace(1) %out, ptr addrspa
 ; GFX12-NEXT:    scratch_load_b32 v2, off, s0 offset:65512
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v[0:1], v2, off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 ;
 ; GFX9-PAL-LABEL: sgpr_base_large_offset:
@@ -4979,8 +4975,6 @@ define amdgpu_gs void @sgpr_base_large_offset(ptr addrspace(1) %out, ptr addrspa
 ; GFX11-PAL-NEXT:    scratch_load_b32 v2, off, s0
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-PAL-NEXT:    global_store_b32 v[0:1], v2, off
-; GFX11-PAL-NEXT:    s_nop 0
-; GFX11-PAL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-PAL-NEXT:    s_endpgm
 ;
 ; GFX12-PAL-LABEL: sgpr_base_large_offset:
@@ -4988,8 +4982,6 @@ define amdgpu_gs void @sgpr_base_large_offset(ptr addrspace(1) %out, ptr addrspa
 ; GFX12-PAL-NEXT:    scratch_load_b32 v2, off, s0 offset:65512
 ; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-PAL-NEXT:    global_store_b32 v[0:1], v2, off
-; GFX12-PAL-NEXT:    s_nop 0
-; GFX12-PAL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-PAL-NEXT:    s_endpgm
 entry:
   %large_offset = getelementptr i8, ptr addrspace(5) %sgpr_base, i32 65512
@@ -5030,8 +5022,6 @@ define amdgpu_gs void @sgpr_base_large_offset_split(ptr addrspace(1) %out, ptr a
 ; GFX11-NEXT:    scratch_load_b32 v2, v2, s0 offset:4072 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: sgpr_base_large_offset_split:
@@ -5041,8 +5031,6 @@ define amdgpu_gs void @sgpr_base_large_offset_split(ptr addrspace(1) %out, ptr a
 ; GFX12-NEXT:    scratch_load_b32 v2, v2, s0 offset:65512 scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v[0:1], v2, off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 ;
 ; GFX9-PAL-LABEL: sgpr_base_large_offset_split:
@@ -5095,8 +5083,6 @@ define amdgpu_gs void @sgpr_base_large_offset_split(ptr addrspace(1) %out, ptr a
 ; GFX11-PAL-NEXT:    scratch_load_b32 v2, v2, s0 offset:4072 glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-PAL-NEXT:    global_store_b32 v[0:1], v2, off
-; GFX11-PAL-NEXT:    s_nop 0
-; GFX11-PAL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-PAL-NEXT:    s_endpgm
 ;
 ; GFX12-PAL-LABEL: sgpr_base_large_offset_split:
@@ -5106,8 +5092,6 @@ define amdgpu_gs void @sgpr_base_large_offset_split(ptr addrspace(1) %out, ptr a
 ; GFX12-PAL-NEXT:    scratch_load_b32 v2, v2, s0 offset:65512 scope:SCOPE_SYS
 ; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-PAL-NEXT:    global_store_b32 v[0:1], v2, off
-; GFX12-PAL-NEXT:    s_nop 0
-; GFX12-PAL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-PAL-NEXT:    s_endpgm
 entry:
   ;%allignedBase = alloca [33554432 x i8], align 4, addrspace(5)
@@ -5259,8 +5243,6 @@ define amdgpu_gs void @sgpr_base_negative_offset(ptr addrspace(1) %out, ptr addr
 ; GFX11-NEXT:    scratch_load_b32 v2, off, s0 offset:-24
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: sgpr_base_negative_offset:
@@ -5268,8 +5250,6 @@ define amdgpu_gs void @sgpr_base_negative_offset(ptr addrspace(1) %out, ptr addr
 ; GFX12-NEXT:    scratch_load_b32 v2, off, s0 offset:-24
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v[0:1], v2, off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 ;
 ; GFX9-PAL-LABEL: sgpr_base_negative_offset:
@@ -5316,8 +5296,6 @@ define amdgpu_gs void @sgpr_base_negative_offset(ptr addrspace(1) %out, ptr addr
 ; GFX11-PAL-NEXT:    scratch_load_b32 v2, off, s0 offset:-24
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-PAL-NEXT:    global_store_b32 v[0:1], v2, off
-; GFX11-PAL-NEXT:    s_nop 0
-; GFX11-PAL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-PAL-NEXT:    s_endpgm
 ;
 ; GFX12-PAL-LABEL: sgpr_base_negative_offset:
@@ -5325,8 +5303,6 @@ define amdgpu_gs void @sgpr_base_negative_offset(ptr addrspace(1) %out, ptr addr
 ; GFX12-PAL-NEXT:    scratch_load_b32 v2, off, s0 offset:-24
 ; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-PAL-NEXT:    global_store_b32 v[0:1], v2, off
-; GFX12-PAL-NEXT:    s_nop 0
-; GFX12-PAL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-PAL-NEXT:    s_endpgm
 entry:
   %scevgep28 = getelementptr i8, ptr addrspace(5) %scevgep, i32 -24
diff --git a/llvm/test/CodeGen/AMDGPU/fma-combine.ll b/llvm/test/CodeGen/AMDGPU/fma-combine.ll
index 4846e21fe836eb..c75521267ae7ce 100644
--- a/llvm/test/CodeGen/AMDGPU/fma-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/fma-combine.ll
@@ -54,8 +54,6 @@ define amdgpu_kernel void @combine_to_fma_f64_0(ptr addrspace(1) noalias %out, p
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
 ; GFX11-NEXT:    global_store_b64 v6, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
@@ -122,8 +120,6 @@ define amdgpu_kernel void @combine_to_fma_f64_0_2use(ptr addrspace(1) noalias %o
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    global_store_b64 v8, v[0:1], s[0:1] offset:8 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
@@ -183,8 +179,6 @@ define amdgpu_kernel void @combine_to_fma_f64_1(ptr addrspace(1) noalias %out, p
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
 ; GFX11-NEXT:    global_store_b64 v6, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
@@ -239,8 +233,6 @@ define amdgpu_kernel void @combine_to_fma_fsub_0_f64(ptr addrspace(1) noalias %o
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], -v[4:5]
 ; GFX11-NEXT:    global_store_b64 v6, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
@@ -307,8 +299,6 @@ define amdgpu_kernel void @combine_to_fma_fsub_f64_0_2use(ptr addrspace(1) noali
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    global_store_b64 v8, v[0:1], s[0:1] offset:8 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
@@ -368,8 +358,6 @@ define amdgpu_kernel void @combine_to_fma_fsub_1_f64(ptr addrspace(1) noalias %o
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], v[4:5]
 ; GFX11-NEXT:    global_store_b64 v6, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
@@ -436,8 +424,6 @@ define amdgpu_kernel void @combine_to_fma_fsub_1_f64_2use(ptr addrspace(1) noali
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    global_store_b64 v8, v[0:1], s[0:1] offset:8 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
@@ -497,8 +483,6 @@ define amdgpu_kernel void @combine_to_fma_fsub_2_f64(ptr addrspace(1) noalias %o
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], -v[4:5]
 ; GFX11-NEXT:    global_store_b64 v6, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
@@ -567,8 +551,6 @@ define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_neg(ptr addrspace(1)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    global_store_b64 v8, v[0:1], s[0:1] offset:8 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
@@ -642,8 +624,6 @@ define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_mul(ptr addrspace(1)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    global_store_b64 v8, v[0:1], s[0:1] offset:8 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
@@ -743,8 +723,6 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_0_f64(ptr addrspace(1)
 ; GFX11-NOFMA-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[6:7]
 ; GFX11-NOFMA-NEXT:    v_add_f64 v[0:1], v[0:1], -v[4:5]
 ; GFX11-NOFMA-NEXT:    global_store_b64 v10, v[0:1], s[0:1]
-; GFX11-NOFMA-NEXT:    s_nop 0
-; GFX11-NOFMA-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NOFMA-NEXT:    s_endpgm
 ;
 ; GFX11-FMA-LABEL: aggressive_combine_to_fma_fsub_0_f64:
@@ -768,8 +746,6 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_0_f64(ptr addrspace(1)
 ; GFX11-FMA-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FMA-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
 ; GFX11-FMA-NEXT:    global_store_b64 v10, v[0:1], s[0:1]
-; GFX11-FMA-NEXT:    s_nop 0
-; GFX11-FMA-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FMA-NEXT:    s_endpgm
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
@@ -869,8 +845,6 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(ptr addrspace(1)
 ; GFX11-NOFMA-NEXT:    v_fma_f64 v[2:3], v[2:3], v[4:5], v[6:7]
 ; GFX11-NOFMA-NEXT:    v_add_f64 v[0:1], v[0:1], -v[2:3]
 ; GFX11-NOFMA-NEXT:    global_store_b64 v10, v[0:1], s[0:1]
-; GFX11-NOFMA-NEXT:    s_nop 0
-; GFX11-NOFMA-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NOFMA-NEXT:    s_endpgm
 ;
 ; GFX11-FMA-LABEL: aggressive_combine_to_fma_fsub_1_f64:
@@ -894,8 +868,6 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(ptr addrspace(1)
 ; GFX11-FMA-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FMA-NEXT:    v_fma_f64 v[0:1], -v[2:3], v[4:5], v[0:1]
 ; GFX11-FMA-NEXT:    global_store_b64 v10, v[0:1], s[0:1]
-; GFX11-FMA-NEXT:    s_nop 0
-; GFX11-FMA-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FMA-NEXT:    s_endpgm
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
@@ -987,8 +959,6 @@ define amdgpu_kernel void @test_f32_mul_add_x_one_y(ptr addrspace(1) %out,
 ; GFX11-NOFMA-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NOFMA-NEXT:    v_mul_f32_e32 v1, v1, v2
 ; GFX11-NOFMA-NEXT:    global_store_b32 v0, v1, s[4:5]
-; GFX11-NOFMA-NEXT:    s_nop 0
-; GFX11-NOFMA-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NOFMA-NEXT:    s_endpgm
 ;
 ; GFX11-FMA-LABEL: test_f32_mul_add_x_one_y:
@@ -1004,8 +974,6 @@ define amdgpu_kernel void @test_f32_mul_add_x_one_y(ptr addrspace(1) %out,
 ; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FMA-NEXT:    v_fmac_f32_e32 v2, v1, v2
 ; GFX11-FMA-NEXT:    global_store_b32 v0, v2, s[4:5]
-; GFX11-FMA-NEXT:    s_nop 0
-; GFX11-FMA-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FMA-NEXT:    s_endpgm
                                         ptr addrspace(1) %in1,
                                         ptr addrspace(1) %in2) {
@@ -1080,8 +1048,6 @@ define amdgpu_kernel void @test_f32_mul_y_add_x_one(ptr addrspace(1) %out,
 ; GFX11-NOFMA-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NOFMA-NEXT:    v_mul_f32_e32 v1, v2, v1
 ; GFX11-NOFMA-NEXT:    global_store_b32 v0, v1, s[4:5]
-; GFX11-NOFMA-NEXT:    s_nop 0
-; GFX11-NOFMA-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NOFMA-NEXT:    s_endpgm
 ;
 ; GFX11-FMA-LABEL: test_f32_mul_y_add_x_one:
@@ -1097,8 +1063,6 @@ define amdgpu_kernel void @test_f32_mul_y_add_x_one(ptr addrspace(1) %out,
 ; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FMA-NEXT:    v_fmac_f32_e32 v2, v1, v2
 ; GFX11-FMA-NEXT:    global_store_b32 v0, v2, s[4:5]
-; GFX11-FMA-NEXT:    s_nop 0
-; GFX11-FMA-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FMA-NEXT:    s_endpgm
                                         ptr addrspace(1) %in1,
                                         ptr addrspace(1) %in2) {
@@ -1173,8 +1137,6 @@ define amdgpu_kernel void @test_f32_mul_add_x_negone_y(ptr addrspace(1) %out,
 ; GFX11-NOFMA-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NOFMA-NEXT:    v_mul_f32_e32 v1, v1, v2
 ; GFX11-NOFMA-NEXT:    global_store_b32 v0, v1, s[4:5]
-; GFX11-NOFMA-NEXT:    s_nop 0
-; GFX11-NOFMA-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NOFMA-NEXT:    s_endpgm
 ;
 ; GFX11-FMA-LABEL: test_f32_mul_add_x_negone_y:
@@ -1190,8 +1152,6 @@ define amdgpu_kernel void @test_f32_mul_add_x_negone_y(ptr addrspace(1) %out,
 ; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FMA-NEXT:    v_fma_f32 v1, v1, v2, -v2
 ; GFX11-FMA-NEXT:    global_store_b32 v0, v1, s[4:5]
-; GFX11-FMA-NEXT:    s_nop 0
-; GFX11-FMA-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FMA-NEXT:    s_endpgm
                                            ptr addrspace(1) %in1,
                                            ptr addrspace(1) %in2) {
@@ -1266,8 +1226,6 @@ define amdgpu_kernel void @test_f32_mul_y_add_x_negone(ptr addrspace(1) %out,
 ; GFX11-NOFMA-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NOFMA-NEXT:    v_mul_f32_e32 v1, v2, v1
 ; GFX11-NOFMA-NEXT:    global_store_b32 v0, v1, s[4:5]
-; GFX11-NOFMA-NEXT:    s_nop 0
-; GFX11-NOFMA-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NOFMA-NEXT:    s_endpgm
 ;
 ; GFX11-FMA-LABEL: test_f32_mul_y_add_x_negone:
@@ -1283,8 +1241,6 @@ define amdgpu_kernel void @test_f32_mul_y_add_x_negone(ptr addrspace(1) %out,
 ; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FMA-NEXT:    v_fma_f32 v1, v1, v2, -v2
 ; GFX11-FMA-NEXT:    global_store_b32 v0, v1, s[4:5]
-; GFX11-FMA-NEXT:    s_nop 0
-; GFX11-FMA-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FMA-NEXT:    s_endpgm
                                            ptr addrspace(1) %in1,
                                            ptr addrspace(1) %in2) {
@@ -1359,8 +1315,6 @@ define amdgpu_kernel void @test_f32_mul_sub_one_x_y(ptr addrspace(1) %out,
 ; GFX11-NOFMA-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NOFMA-NEXT:    v_mul_f32_e32 v1, v1, v2
 ; GFX11-NOFMA-NEXT:    global_store_b32 v0, v1, s[4:5]
-; GFX11-NOFMA-NEXT:    s_nop 0
-; GFX11-NOFMA-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NOFMA-NEXT:    s_endpgm
 ;
 ; GFX11-FMA-LABEL: test_f32_mul_sub_one_x_y:
@@ -1376,8 +1330,6 @@ define amdgpu_kernel void @test_f32_mul_sub_one_x_y(ptr addrspace(1) %out,
 ; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FMA-NEXT:    v_fma_f32 v1, -v1, v2, v2
 ; GFX11-FMA-NEXT:    global_store_b32 v0, v1, s[4:5]
-; GFX11-FMA-NEXT:    s_nop 0
-; GFX11-FMA-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FMA-NEXT:    s_endpgm
                                         ptr addrspace(1) %in1,
                                         ptr addrspace(1) %in2) {
@@ -1452,8 +1404,6 @@ define amdgpu_kernel void @test_f32_mul_y_sub_one_x(ptr addrspace(1) %out,
 ; GFX11-NOFMA-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NOFMA-NEXT:    v_mul_f32_e32 v1, v2, v1
 ; GFX11-NOFMA-NEXT:    global_store_b32 v0, v1, s[4:5]
-; GFX11-NOFMA-NEXT:    s_nop 0
-; GFX11-NOFMA-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NOFMA-NEXT:    s_endpgm
 ;
 ; GFX11-FMA-LABEL: test_f32_mul_y_sub_one_x:
@@ -1469,8 +1419,6 @@ define amdgpu_kernel void @test_f32_mul_y_sub_one_x(ptr addrspace(1) %out,
 ; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FMA-NEXT:    v_fma_f32 v1, -v1, v2, v2
 ; GFX11-FMA-NEXT:    global_store_b32 v0, v1, s[4:5]
-; GFX11-FMA-NEXT:    s_nop 0
-; GFX11-FMA-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FMA-NEXT:    s_endpgm
                                         ptr addrspace(1) %in1,
                                         ptr addrspace(1) %in2) {
@@ -1545,8 +1493,6 @@ define amdgpu_kernel void @test_f32_mul_sub_negone_x_y(ptr addrspace(1) %out,
 ; GFX11-NOFMA-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NOFMA-NEXT:    v_mul_f32_e32 v1, v1, v2
 ; GFX11-NOFMA-NEXT:    global_store_b32 v0, v1, s[4:5]
-; GFX11-NOFMA-NEXT:    s_nop 0
-; GFX11-NOFMA-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NOFMA-NEXT:    s_endpgm
 ;
 ; GFX11-FMA-LABEL: test_f32_mul_sub_negone_x_y:
@@ -1562,8 +1508,6 @@ define amdgpu_kernel void @test_f32_mul_sub_negone_x_y(ptr addrspace(1) %out,
 ; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FMA-NEXT:    v_fma_f32 v1, -v1, v2, -v2
 ; GFX11-FMA-NEXT:    global_store_b32 v0, v1, s[4:5]
-; GFX11-FMA-NEXT:    s_nop 0
-; GFX11-FMA-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FMA-NEXT:    s_endpgm
                                            ptr addrspace(1) %in1,
                                            ptr addrspace(1) %in2) {
@@ -1638,8 +1582,6 @@ define amdgpu_kernel void @test_f32_mul_y_sub_negone_x(ptr addrspace(1) %out,
 ; GFX11-NOFMA-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NOFMA-NEXT:    v_mul_f32_e32 v1, v2, v1
 ; GFX11-NOFMA-NEXT:    global_store_b32 v0, v1, s[4:5]
-; GFX11-NOFMA-NEXT:    s_nop 0
-; GFX11-NOFMA-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NOFMA-NEXT:    s_endpgm
 ;
 ; GFX11-FMA-LABEL: test_f32_mul_y_sub_negone_x:
@@ -1655,8 +1597,6 @@ define amdgpu_kernel void @test_f32_mul_y_sub_negone_x(ptr addrspace(1) %out,
 ; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FMA-NEXT:    v_fma_f32 v1, -v1, v2, -v2
 ; GFX11-FMA-NEXT:    global_store_b32 v0, v1, s[4:5]
-; GFX11-FMA-NEXT:    s_nop 0
-; GFX11-FMA-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FMA-NEXT:    s_endpgm
                                          ptr addrspace(1) %in1,
                                          ptr addrspace(1) %in2) {
@@ -1731,8 +1671,6 @@ define amdgpu_kernel void @test_f32_mul_sub_x_one_y(ptr addrspace(1) %out,
 ; GFX11-NOFMA-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NOFMA-NEXT:    v_mul_f32_e32 v1, v1, v2
 ; GFX11-NOFMA-NEXT:    global_store_b32 v0, v1, s[4:5]
-; GFX11-NOFMA-NEXT:    s_nop 0
-; GFX11-NOFMA-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NOFMA-NEXT:    s_endpgm
 ;
 ; GFX11-FMA-LABEL: test_f32_mul_sub_x_one_y:
@@ -1748,8 +1686,6 @@ define amdgpu_kernel void @test_f32_mul_sub_x_one_y(ptr addrspace(1) %out,
 ; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FMA-NEXT:    v_fma_f32 v1, v1, v2, -v2
 ; GFX11-FMA-NEXT:    global_store_b32 v0, v1, s[4:5]
-; GFX11-FMA-NEXT:    s_nop 0
-; GFX11-FMA-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FMA-NEXT:    s_endpgm
                                         ptr addrspace(1) %in1,
                                         ptr addrspace(1) %in2) {
@@ -1824,8 +1760,6 @@ define amdgpu_kernel void @test_f32_mul_y_sub_x_one(ptr addrspace(1) %out,
 ; GFX11-NOFMA-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NOFMA-NEXT:    v_mul_f32_e32 v1, v2, v1
 ; GFX11-NOFMA-NEXT:    global_store_b32 v0, v1, s[4:5]
-; GFX11-NOFMA-NEXT:    s_nop 0
-; GFX11-NOFMA-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NOFMA-NEXT:    s_endpgm
 ;
 ; GFX11-FMA-LABEL: test_f32_mul_y_sub_x_one:
@@ -1841,8 +1775,6 @@ define amdgpu_kernel void @test_f32_mul_y_sub_x_one(ptr addrspace(1) %out,
 ; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FMA-NEXT:    v_fma_f32 v1, v1, v2, -v2
 ; GFX11-FMA-NEXT:    global_store_b32 v0, v1, s[4:5]
-; GFX11-FMA-NEXT:    s_nop 0
-; GFX11-FMA-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FMA-NEXT:    s_endpgm
                                       ptr addrspace(1) %in1,
                                       ptr addrspace(1) %in2) {
@@ -1917,8 +1849,6 @@ define amdgpu_kernel void @test_f32_mul_sub_x_negone_y(ptr addrspace(1) %out,
 ; GFX11-NOFMA-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NOFMA-NEXT:    v_mul_f32_e32 v1, v1, v2
 ; GFX11-NOFMA-NEXT:    global_store_b32 v0, v1, s[4:5]
-; GFX11-NOFMA-NEXT:    s_nop 0
-; GFX11-NOFMA-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NOFMA-NEXT:    s_endpgm
 ;
 ; GFX11-FMA-LABEL: test_f32_mul_sub_x_negone_y:
@@ -1934,8 +1864,6 @@ define amdgpu_kernel void @test_f32_mul_sub_x_negone_y(ptr addrspace(1) %out,
 ; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FMA-NEXT:    v_fmac_f32_e32 v2, v1, v2
 ; GFX11-FMA-NEXT:    global_store_b32 v0, v2, s[4:5]
-; GFX11-FMA-NEXT:    s_nop 0
-; GFX11-FMA-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FMA-NEXT:    s_endpgm
                                          ptr addrspace(1) %in1,
                                          ptr addrspace(1) %in2) {
@@ -2010,8 +1938,6 @@ define amdgpu_kernel void @test_f32_mul_y_sub_x_negone(ptr addrspace(1) %out,
 ; GFX11-NOFMA-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NOFMA-NEXT:    v_mul_f32_e32 v1, v2, v1
 ; GFX11-NOFMA-NEXT:    global_store_b32 v0, v1, s[4:5]
-; GFX11-NOFMA-NEXT:    s_nop 0
-; GFX11-NOFMA-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NOFMA-NEXT:    s_endpgm
 ;
 ; GFX11-FMA-LABEL: test_f32_mul_y_sub_x_negone:
@@ -2027,8 +1953,6 @@ define amdgpu_kernel void @test_f32_mul_y_sub_x_negone(ptr addrspace(1) %out,
 ; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FMA-NEXT:    v_fmac_f32_e32 v2, v1, v2
 ; GFX11-FMA-NEXT:    global_store_b32 v0, v2, s[4:5]
-; GFX11-FMA-NEXT:    s_nop 0
-; GFX11-FMA-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FMA-NEXT:    s_endpgm
                                          ptr addrspace(1) %in1,
                                          ptr addrspace(1) %in2) {
@@ -2124,8 +2048,6 @@ define amdgpu_kernel void @test_f32_interp(ptr addrspace(1) %out,
 ; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NOFMA-NEXT:    v_fmac_f32_e32 v2, v3, v1
 ; GFX11-NOFMA-NEXT:    global_store_b32 v0, v2, s[0:1]
-; GFX11-NOFMA-NEXT:    s_nop 0
-; GFX11-NOFMA-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NOFMA-NEXT:    s_endpgm
 ;
 ; GFX11-FMA-LABEL: test_f32_interp:
@@ -2143,8 +2065,6 @@ define amdgpu_kernel void @test_f32_interp(ptr addrspace(1) %out,
 ; GFX11-FMA-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FMA-NEXT:    v_fmac_f32_e32 v1, v3, v2
 ; GFX11-FMA-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-FMA-NEXT:    s_nop 0
-; GFX11-FMA-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FMA-NEXT:    s_endpgm
                              ptr addrspace(1) %in1,
                              ptr addrspace(1) %in2,
@@ -2208,8 +2128,6 @@ define amdgpu_kernel void @test_f64_interp(ptr addrspace(1) %out,
 ; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NOFMA-NEXT:    v_fma_f64 v[0:1], v[4:5], v[0:1], v[2:3]
 ; GFX11-NOFMA-NEXT:    global_store_b64 v8, v[0:1], s[0:1]
-; GFX11-NOFMA-NEXT:    s_nop 0
-; GFX11-NOFMA-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NOFMA-NEXT:    s_endpgm
 ;
 ; GFX11-FMA-LABEL: test_f64_interp:
@@ -2227,8 +2145,6 @@ define amdgpu_kernel void @test_f64_interp(ptr addrspace(1) %out,
 ; GFX11-FMA-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FMA-NEXT:    v_fma_f64 v[0:1], v[4:5], v[2:3], v[0:1]
 ; GFX11-FMA-NEXT:    global_store_b64 v6, v[0:1], s[0:1]
-; GFX11-FMA-NEXT:    s_nop 0
-; GFX11-FMA-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FMA-NEXT:    s_endpgm
                              ptr addrspace(1) %in1,
                              ptr addrspace(1) %in2,
@@ -2275,8 +2191,6 @@ define amdgpu_kernel void @fma_neg_2.0_neg_a_b_f32(ptr addrspace(1) %out, ptr ad
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_fmac_f32_e32 v2, 2.0, v1
 ; GFX11-NEXT:    global_store_b32 v0, v2, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr float, ptr addrspace(1) %out, i32 %tid
@@ -2323,8 +2237,6 @@ define amdgpu_kernel void @fma_2.0_neg_a_b_f32(ptr addrspace(1) %out, ptr addrsp
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_fmac_f32_e32 v2, -2.0, v1
 ; GFX11-NEXT:    global_store_b32 v0, v2, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr float, ptr addrspace(1) %out, i32 %tid
@@ -2380,8 +2292,6 @@ define amdgpu_kernel void @fma_neg_b_c_v4f32(ptr addrspace(1) %out, ptr addrspac
 ; GFX11-NEXT:    v_fma_f32 v1, v9, -v5, -v1
 ; GFX11-NEXT:    v_fma_f32 v0, v8, -v4, -v0
 ; GFX11-NEXT:    global_store_b128 v12, v[0:3], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr <4 x float>, ptr addrspace(1) %in, i32 %tid
diff --git a/llvm/test/CodeGen/AMDGPU/fmax3.ll b/llvm/test/CodeGen/AMDGPU/fmax3.ll
index f86f5305e6ba15..5caaa2c9550f9a 100644
--- a/llvm/test/CodeGen/AMDGPU/fmax3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmax3.ll
@@ -123,8 +123,6 @@ define amdgpu_kernel void @test_fmax3_olt_0_f32(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    s_mov_b32 s9, s1
 ; GFX11-NEXT:    v_max3_f32 v0, v0, v1, v2
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %a = load volatile  float, ptr addrspace(1) %aptr, align 4
   %b = load volatile float, ptr addrspace(1) %bptr, align 4
@@ -255,8 +253,6 @@ define amdgpu_kernel void @test_fmax3_olt_1_f32(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    s_mov_b32 s9, s1
 ; GFX11-NEXT:    v_max3_f32 v0, v2, v0, v1
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %a = load volatile float, ptr addrspace(1) %aptr, align 4
   %b = load volatile float, ptr addrspace(1) %bptr, align 4
@@ -394,8 +390,6 @@ define amdgpu_kernel void @test_fmax3_olt_0_f16(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    s_mov_b32 s9, s1
 ; GFX11-NEXT:    v_max3_f16 v0, v0, v1, v2
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %a = load volatile half, ptr addrspace(1) %aptr, align 2
   %b = load volatile half, ptr addrspace(1) %bptr, align 2
@@ -534,8 +528,6 @@ define amdgpu_kernel void @test_fmax3_olt_1_f16(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    s_mov_b32 s9, s1
 ; GFX11-NEXT:    v_max3_f16 v0, v2, v0, v1
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %a = load volatile half, ptr addrspace(1) %aptr, align 2
   %b = load volatile half, ptr addrspace(1) %bptr, align 2
diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum.ll b/llvm/test/CodeGen/AMDGPU/fmaximum.ll
index 04cd150d93176f..84c3913ec93c56 100644
--- a/llvm/test/CodeGen/AMDGPU/fmaximum.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmaximum.ll
@@ -273,8 +273,6 @@ define amdgpu_kernel void @fmaximumi_f32_move_to_valu(ptr addrspace(1) %out, ptr
 ; GCN-NEXT:    s_wait_loadcnt 0x0
 ; GCN-NEXT:    v_maximum_f32 v1, v1, v2
 ; GCN-NEXT:    global_store_b32 v0, v1, s[4:5]
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GCN-NEXT:    s_endpgm
   %a = load volatile float, ptr addrspace(1) %aptr, align 4
   %b = load volatile float, ptr addrspace(1) %bptr, align 4
@@ -297,8 +295,6 @@ define amdgpu_kernel void @fmaximum_f16_move_to_valu(ptr addrspace(1) %out, ptr
 ; GCN-NEXT:    s_wait_loadcnt 0x0
 ; GCN-NEXT:    v_maximum_f16 v1, v1, v2
 ; GCN-NEXT:    global_store_b16 v0, v1, s[4:5]
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GCN-NEXT:    s_endpgm
   %a = load volatile half, ptr addrspace(1) %aptr, align 4
   %b = load volatile half, ptr addrspace(1) %bptr, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/fmed3.ll b/llvm/test/CodeGen/AMDGPU/fmed3.ll
index 6634d36122d0a6..58e864b496b374 100644
--- a/llvm/test/CodeGen/AMDGPU/fmed3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmed3.ll
@@ -105,8 +105,6 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(ptr addrspace(1) %o
 ; GFX11-NEXT:    v_add_f32_e32 v1, 1.0, v1
 ; GFX11-NEXT:    v_med3_f32 v1, v1, 2.0, 4.0
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -217,8 +215,6 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_f32(ptr addrspace(1) %out, pt
 ; GFX11-NEXT:    v_add_f32_e32 v1, 1.0, v1
 ; GFX11-NEXT:    v_med3_f32 v1, v1, 2.0, 4.0
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -330,8 +326,6 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute0_f32(ptr addrspace(1)
 ; GFX11-NEXT:    v_add_f32_e32 v1, 1.0, v1
 ; GFX11-NEXT:    v_med3_f32 v1, v1, 2.0, 4.0
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -443,8 +437,6 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute1_f32(ptr addrspace(1)
 ; GFX11-NEXT:    v_add_f32_e32 v1, 1.0, v1
 ; GFX11-NEXT:    v_med3_f32 v1, v1, 2.0, 4.0
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -561,8 +553,6 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_constant_order_f32(ptr addrsp
 ; GFX11-NEXT:    v_add_f32_e32 v1, 1.0, v1
 ; GFX11-NEXT:    v_maxmin_f32 v1, v1, 4.0, 2.0
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -715,8 +705,6 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_multi_use_f32(ptr addrspace(1
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1] dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32:
@@ -735,8 +723,6 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_multi_use_f32(ptr addrspace(1
 ; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-GISEL-NEXT:    global_store_b32 v0, v1, s[0:1] dlc
 ; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -856,8 +842,6 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_f64(ptr addrspace(1) %out, ptr add
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_min_f64 v[0:1], v[0:1], 4.0
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr double, ptr addrspace(1) %aptr, i32 %tid
@@ -963,8 +947,6 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_no_nans_f32(ptr addrspace(1) %out,
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_med3_f32 v1, v1, 2.0, 4.0
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -1094,8 +1076,6 @@ define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(ptr addrspace(1) %out,
 ; GFX11-SDAG-NEXT:    v_add_f32_e32 v1, 1.0, v1
 ; GFX11-SDAG-NEXT:    v_med3_f32 v1, v1, 2.0, 4.0
 ; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_test_legacy_fmed3_r_i_i_f32:
@@ -1114,8 +1094,6 @@ define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(ptr addrspace(1) %out,
 ; GFX11-GISEL-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, 4.0, v1
 ; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, 4.0, vcc_lo
 ; GFX11-GISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -1288,8 +1266,6 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(ptr addrspa
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-SDAG-NEXT:    v_med3_f32 v1, -v1, v2, v3
 ; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0:
@@ -1309,8 +1285,6 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(ptr addrspa
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_med3_f32 v1, v1, v2, v3
 ; GFX11-GISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -1482,8 +1456,6 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(ptr addrspa
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-SDAG-NEXT:    v_med3_f32 v1, v1, -v2, v3
 ; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1:
@@ -1503,8 +1475,6 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(ptr addrspa
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_med3_f32 v1, v1, v2, v3
 ; GFX11-GISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -1676,8 +1646,6 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(ptr addrspa
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-SDAG-NEXT:    v_med3_f32 v1, v1, v2, -v3
 ; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2:
@@ -1697,8 +1665,6 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(ptr addrspa
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_med3_f32 v1, v1, v2, v3
 ; GFX11-GISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -1873,8 +1839,6 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(ptr addrs
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-SDAG-NEXT:    v_med3_f32 v1, -v1, |v2|, -|v3|
 ; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012:
@@ -1895,8 +1859,6 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(ptr addrs
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_med3_f32 v1, v1, |v2|, v3
 ; GFX11-GISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -2080,8 +2042,6 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(ptr addrs
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-SDAG-NEXT:    v_med3_f32 v1, -|v1|, -|v2|, -|v3|
 ; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012:
@@ -2103,8 +2063,6 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(ptr addrs
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_med3_f32 v1, v1, v2, v3
 ; GFX11-GISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -2284,8 +2242,6 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(ptr addrspace(1) %out, pt
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_med3_f32 v1, v1, v2, v3
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -2443,8 +2399,6 @@ define amdgpu_kernel void @v_nnan_input_calls_med3_f32_pat0(ptr addrspace(1) %ou
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_med3_f32 v1, v1, v2, v3
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -2597,8 +2551,6 @@ define amdgpu_kernel void @v_nnan_call_med3_f32_pat0(ptr addrspace(1) %out, ptr
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_med3_f32 v1, v1, v2, v3
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -2751,8 +2703,6 @@ define amdgpu_kernel void @v_fast_call_med3_f32_pat0(ptr addrspace(1) %out, ptr
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_med3_f32 v1, v1, v2, v3
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -2917,8 +2867,6 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0(ptr addrspace(1) %o
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_med3_f32 v1, v1, v2, v3
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -3071,8 +3019,6 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(ptr addrspace(1) %o
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_med3_f32 v1, v1, v2, v3
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -3243,8 +3189,6 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1_srcmod0(ptr addrspa
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-SDAG-NEXT:    v_med3_f32 v1, -v1, v2, v3
 ; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0:
@@ -3264,8 +3208,6 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1_srcmod0(ptr addrspa
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_med3_f32 v1, v1, v2, v3
 ; GFX11-GISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -3419,8 +3361,6 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat2(ptr addrspace(1) %o
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_med3_f32 v1, v1, v2, v3
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -3573,8 +3513,6 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat3(ptr addrspace(1) %o
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_med3_f32 v1, v1, v2, v3
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -3727,8 +3665,6 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat4(ptr addrspace(1) %o
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_med3_f32 v1, v2, v1, v3
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -3881,8 +3817,6 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat5(ptr addrspace(1) %o
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_med3_f32 v1, v2, v1, v3
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -4035,8 +3969,6 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat6(ptr addrspace(1) %o
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_med3_f32 v1, v2, v1, v3
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -4189,8 +4121,6 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat7(ptr addrspace(1) %o
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_med3_f32 v1, v2, v1, v3
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -4343,8 +4273,6 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat8(ptr addrspace(1) %o
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_med3_f32 v1, v1, v2, v3
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -4497,8 +4425,6 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat9(ptr addrspace(1) %o
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_med3_f32 v1, v2, v1, v3
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -4651,8 +4577,6 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat10(ptr addrspace(1) %
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_med3_f32 v1, v1, v2, v3
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -4805,8 +4729,6 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat11(ptr addrspace(1) %
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_med3_f32 v1, v2, v1, v3
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -4959,8 +4881,6 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat12(ptr addrspace(1) %
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_med3_f32 v1, v2, v1, v3
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -5113,8 +5033,6 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat13(ptr addrspace(1) %
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_med3_f32 v1, v2, v1, v3
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -5267,8 +5185,6 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat14(ptr addrspace(1) %
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_med3_f32 v1, v1, v2, v3
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -5421,8 +5337,6 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat15(ptr addrspace(1) %
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_med3_f32 v1, v2, v1, v3
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -5578,8 +5492,6 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat16(ptr addrspace(1) %
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_med3_f32 v1, v1, v2, v3
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -5785,8 +5697,6 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1)
 ; GFX11-NEXT:    global_store_b32 v[0:1], v4, off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -5991,8 +5901,6 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1)
 ; GFX11-SDAG-NEXT:    global_store_b32 v[0:1], v4, off dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_test_safe_med3_f32_pat0_multi_use1:
@@ -6016,8 +5924,6 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1)
 ; GFX11-GISEL-NEXT:    global_store_b32 v[0:1], v1, off dlc
 ; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-GISEL-NEXT:    global_store_b32 v0, v2, s[0:1]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -6220,8 +6126,6 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1)
 ; GFX11-NEXT:    global_store_b32 v[0:1], v3, off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -6408,8 +6312,6 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr
 ; GFX11-SDAG-NEXT:    v_maxmin_f32 v3, v1, v2, v3
 ; GFX11-SDAG-NEXT:    v_minmax_f32 v1, v1, v2, v3
 ; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_test_safe_med3_f32_pat0:
@@ -6431,8 +6333,6 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr
 ; GFX11-GISEL-NEXT:    v_dual_max_f32 v1, v1, v2 :: v_dual_max_f32 v2, v3, v3
 ; GFX11-GISEL-NEXT:    v_minmax_f32 v1, v1, v2, v4
 ; GFX11-GISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -6603,8 +6503,6 @@ define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(ptr addrspace(1)
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_med3_f32 v1, v1, v2, v3
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -6780,8 +6678,6 @@ define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(ptr addrspace(1)
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_med3_f32 v1, v1, v2, v3
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -6957,8 +6853,6 @@ define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(ptr addrspace(1)
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_med3_f32 v1, v1, v2, v3
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -7134,8 +7028,6 @@ define amdgpu_kernel void @v_test_nnan_on_call_med3_f32_pat0_srcmod0(ptr addrspa
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-SDAG-NEXT:    v_med3_f32 v1, -v1, v2, v3
 ; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0:
@@ -7155,8 +7047,6 @@ define amdgpu_kernel void @v_test_nnan_on_call_med3_f32_pat0_srcmod0(ptr addrspa
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_med3_f32 v1, v1, v2, v3
 ; GFX11-GISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -7347,8 +7237,6 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(pt
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SDAG-NEXT:    v_minmax_f32 v1, -v1, v2, v3
 ; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch:
@@ -7370,8 +7258,6 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(pt
 ; GFX11-GISEL-NEXT:    v_min_f32_e32 v4, v4, v2
 ; GFX11-GISEL-NEXT:    v_minmax_f32 v1, v1, v3, v4
 ; GFX11-GISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -7531,8 +7417,6 @@ define amdgpu_kernel void @v_test_global_nnans_min_max_f32(ptr addrspace(1) %out
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_maxmin_f32 v1, v1, v2, v3
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -7659,8 +7543,6 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(ptr addrspace(1) %o
 ; GFX11-NEXT:    v_add_f16_e32 v1, 1.0, v1
 ; GFX11-NEXT:    v_med3_f16 v1, v1, 2.0, 4.0
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr half, ptr addrspace(1) %aptr, i32 %tid
@@ -7860,8 +7742,6 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, pt
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_med3_f16 v1, v1, v2, v3
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr half, ptr addrspace(1) %aptr, i32 %tid
@@ -7988,8 +7868,6 @@ define amdgpu_kernel void @two_non_inline_constant(ptr addrspace(1) %out, ptr ad
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SDAG-NEXT:    v_maxmin_f32 v1, v1, s2, 0x41800000
 ; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: two_non_inline_constant:
@@ -8005,8 +7883,6 @@ define amdgpu_kernel void @two_non_inline_constant(ptr addrspace(1) %out, ptr ad
 ; GFX11-GISEL-NEXT:    v_add_f32_e32 v1, 0.5, v1
 ; GFX11-GISEL-NEXT:    v_maxmin_f32 v1, v1, 0x41000000, v2
 ; GFX11-GISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -8144,8 +8020,6 @@ define amdgpu_kernel void @one_non_inline_constant(ptr addrspace(1) %out, ptr ad
 ; GFX11-NEXT:    global_store_b32 v0, v2, s[0:1]
 ; GFX11-NEXT:    global_store_b32 v[0:1], v1, off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -8329,8 +8203,6 @@ define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %o
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-SDAG-NEXT:    global_store_b32 v[0:1], v1, off dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: two_non_inline_constant_multi_use:
@@ -8352,8 +8224,6 @@ define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %o
 ; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-GISEL-NEXT:    global_store_b32 v[0:1], v1, off dlc
 ; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
diff --git a/llvm/test/CodeGen/AMDGPU/fmin3.ll b/llvm/test/CodeGen/AMDGPU/fmin3.ll
index 2025ddb07e83a6..714040405bf67b 100644
--- a/llvm/test/CodeGen/AMDGPU/fmin3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmin3.ll
@@ -123,8 +123,6 @@ define amdgpu_kernel void @test_fmin3_olt_0_f32(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    s_mov_b32 s9, s1
 ; GFX11-NEXT:    v_min3_f32 v0, v0, v1, v2
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %a = load volatile float, ptr addrspace(1) %aptr, align 4
   %b = load volatile float, ptr addrspace(1) %bptr, align 4
@@ -255,8 +253,6 @@ define amdgpu_kernel void @test_fmin3_olt_1_f32(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    s_mov_b32 s9, s1
 ; GFX11-NEXT:    v_min3_f32 v0, v2, v0, v1
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %a = load volatile float, ptr addrspace(1) %aptr, align 4
   %b = load volatile float, ptr addrspace(1) %bptr, align 4
@@ -394,8 +390,6 @@ define amdgpu_kernel void @test_fmin3_olt_0_f16(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    s_mov_b32 s9, s1
 ; GFX11-NEXT:    v_min3_f16 v0, v0, v1, v2
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %a = load volatile half, ptr addrspace(1) %aptr, align 2
   %b = load volatile half, ptr addrspace(1) %bptr, align 2
@@ -534,8 +528,6 @@ define amdgpu_kernel void @test_fmin3_olt_1_f16(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    s_mov_b32 s9, s1
 ; GFX11-NEXT:    v_min3_f16 v0, v2, v0, v1
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %a = load volatile half, ptr addrspace(1) %aptr, align 2
   %b = load volatile half, ptr addrspace(1) %bptr, align 2
@@ -743,8 +735,6 @@ define amdgpu_kernel void @test_fmin3_olt_0_f64(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
 ; GFX11-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
 ; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %a = load volatile double, ptr addrspace(1) %aptr, align 4
   %b = load volatile double, ptr addrspace(1) %bptr, align 4
@@ -890,8 +880,6 @@ define amdgpu_kernel void @test_fmin3_olt_1_f64(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
 ; GFX11-NEXT:    v_min_f64 v[0:1], v[2:3], v[0:1]
 ; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %a = load volatile double, ptr addrspace(1) %aptr, align 4
   %b = load volatile double, ptr addrspace(1) %bptr, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/fminimum.ll b/llvm/test/CodeGen/AMDGPU/fminimum.ll
index 3271758f712971..0353fc4f2b91b8 100644
--- a/llvm/test/CodeGen/AMDGPU/fminimum.ll
+++ b/llvm/test/CodeGen/AMDGPU/fminimum.ll
@@ -273,8 +273,6 @@ define amdgpu_kernel void @fminimumi_f32_move_to_valu(ptr addrspace(1) %out, ptr
 ; GCN-NEXT:    s_wait_loadcnt 0x0
 ; GCN-NEXT:    v_minimum_f32 v1, v1, v2
 ; GCN-NEXT:    global_store_b32 v0, v1, s[4:5]
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GCN-NEXT:    s_endpgm
   %a = load volatile float, ptr addrspace(1) %aptr, align 4
   %b = load volatile float, ptr addrspace(1) %bptr, align 4
@@ -297,8 +295,6 @@ define amdgpu_kernel void @fminimum_f16_move_to_valu(ptr addrspace(1) %out, ptr
 ; GCN-NEXT:    s_wait_loadcnt 0x0
 ; GCN-NEXT:    v_minimum_f16 v1, v1, v2
 ; GCN-NEXT:    global_store_b16 v0, v1, s[4:5]
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GCN-NEXT:    s_endpgm
   %a = load volatile half, ptr addrspace(1) %aptr, align 4
   %b = load volatile half, ptr addrspace(1) %bptr, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
index c60b9858abd836..57a960207180c6 100644
--- a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
@@ -59,8 +59,6 @@ define amdgpu_kernel void @multiple_fadd_use_test_f32(ptr addrspace(1) %out, flo
 ; GFX11-NEXT:    v_mul_f32_e32 v1, v0, v0
 ; GFX11-NEXT:    v_fma_f32 v0, -v1, v0, 1.0
 ; GFX11-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %a11 = fadd float %y, -1.0
   %a12 = call float @llvm.fabs.f32(float %a11)
@@ -128,8 +126,6 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f32(ptr addrspace(1) %out, flo
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    global_store_b32 v0, v2, s[0:1] offset:4 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %out.gep.1 = getelementptr float, ptr addrspace(1) %out, i32 1
   %mul2 = fmul fast float %x, 2.0
@@ -183,8 +179,6 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f32(ptr addrspace(1) %out, flo
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    global_store_b32 v0, v2, s[0:1] offset:4 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %out.gep.1 = getelementptr float, ptr addrspace(1) %out, i32 1
   %x.abs = call float @llvm.fabs.f32(float %x)
@@ -245,8 +239,6 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f32(ptr addrspace(1) %ou
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    global_store_b32 v0, v2, s[0:1] offset:4 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %out.gep.1 = getelementptr float, ptr addrspace(1) %out, i32 1
   %x.abs = call float @llvm.fabs.f32(float %x)
@@ -296,8 +288,6 @@ define amdgpu_kernel void @fmul_x2_xn2_f32(ptr addrspace(1) %out, float %x, floa
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, s4, v0
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1] dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %out.gep.1 = getelementptr float, ptr addrspace(1) %out, i32 1
   %mul2 = fmul fast float %x, 2.0
@@ -346,8 +336,6 @@ define amdgpu_kernel void @fmul_x2_xn3_f32(ptr addrspace(1) %out, float %x, floa
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, s4, v0
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1] dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %out.gep.1 = getelementptr float, ptr addrspace(1) %out, i32 1
   %mul2 = fmul fast float %x, 2.0
@@ -449,8 +437,6 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16
 ; GFX11-DENORM-NEXT:    v_fma_f16 v0, -v1, v0, 1.0
 ; GFX11-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-DENORM-NEXT:    global_store_b16 v2, v0, s[0:1]
-; GFX11-DENORM-NEXT:    s_nop 0
-; GFX11-DENORM-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DENORM-NEXT:    s_endpgm
 ;
 ; GFX11-FLUSH-LABEL: multiple_fadd_use_test_f16:
@@ -473,8 +459,6 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16
 ; GFX11-FLUSH-NEXT:    v_sub_f16_e32 v0, 1.0, v0
 ; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-FLUSH-NEXT:    global_store_b16 v1, v0, s[0:1]
-; GFX11-FLUSH-NEXT:    s_nop 0
-; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FLUSH-NEXT:    s_endpgm
   %x = bitcast i16 %x.arg to half
   %y = bitcast i16 %y.arg to half
@@ -582,8 +566,6 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16
 ; GFX11-DENORM-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-DENORM-NEXT:    global_store_b16 v0, v2, s[0:1] offset:2 dlc
 ; GFX11-DENORM-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-DENORM-NEXT:    s_nop 0
-; GFX11-DENORM-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DENORM-NEXT:    s_endpgm
 ;
 ; GFX11-FLUSH-LABEL: multiple_use_fadd_fmac_f16:
@@ -601,8 +583,6 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16
 ; GFX11-FLUSH-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-FLUSH-NEXT:    global_store_b16 v1, v2, s[0:1] offset:2 dlc
 ; GFX11-FLUSH-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-FLUSH-NEXT:    s_nop 0
-; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FLUSH-NEXT:    s_endpgm
   %x = bitcast i16 %x.arg to half
   %y = bitcast i16 %y.arg to half
@@ -703,8 +683,6 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16
 ; GFX11-DENORM-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-DENORM-NEXT:    global_store_b16 v0, v2, s[0:1] offset:2 dlc
 ; GFX11-DENORM-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-DENORM-NEXT:    s_nop 0
-; GFX11-DENORM-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DENORM-NEXT:    s_endpgm
 ;
 ; GFX11-FLUSH-LABEL: multiple_use_fadd_fmad_f16:
@@ -722,8 +700,6 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16
 ; GFX11-FLUSH-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-FLUSH-NEXT:    global_store_b16 v1, v2, s[0:1] offset:2 dlc
 ; GFX11-FLUSH-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-FLUSH-NEXT:    s_nop 0
-; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FLUSH-NEXT:    s_endpgm
   %x = bitcast i16 %x.arg to half
   %y = bitcast i16 %y.arg to half
@@ -833,8 +809,6 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou
 ; GFX11-DENORM-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-DENORM-NEXT:    global_store_b16 v0, v2, s[2:3] offset:2 dlc
 ; GFX11-DENORM-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-DENORM-NEXT:    s_nop 0
-; GFX11-DENORM-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DENORM-NEXT:    s_endpgm
 ;
 ; GFX11-FLUSH-LABEL: multiple_use_fadd_multi_fmad_f16:
@@ -854,8 +828,6 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou
 ; GFX11-FLUSH-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-FLUSH-NEXT:    global_store_b16 v1, v0, s[2:3] offset:2 dlc
 ; GFX11-FLUSH-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-FLUSH-NEXT:    s_nop 0
-; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FLUSH-NEXT:    s_endpgm
   %x = bitcast i16 %x.arg to half
   %y = bitcast i16 %y.arg to half
@@ -909,8 +881,6 @@ define amdgpu_kernel void @fmul_x2_xn2_f16(ptr addrspace(1) %out, i16 zeroext %x
 ; GFX11-NEXT:    v_mul_f16_e32 v0, s4, v0
 ; GFX11-NEXT:    global_store_b16 v1, v0, s[0:1] dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %x = bitcast i16 %x.arg to half
   %y = bitcast i16 %y.arg to half
@@ -962,8 +932,6 @@ define amdgpu_kernel void @fmul_x2_xn3_f16(ptr addrspace(1) %out, i16 zeroext %x
 ; GFX11-NEXT:    v_mul_f16_e32 v0, s4, v0
 ; GFX11-NEXT:    global_store_b16 v1, v0, s[0:1] dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %x = bitcast i16 %x.arg to half
   %y = bitcast i16 %y.arg to half
diff --git a/llvm/test/CodeGen/AMDGPU/fmul.f16.ll b/llvm/test/CodeGen/AMDGPU/fmul.f16.ll
index 7b8384f317c6c3..e12c854f03c627 100644
--- a/llvm/test/CodeGen/AMDGPU/fmul.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmul.f16.ll
@@ -76,8 +76,6 @@ define amdgpu_kernel void @fmul_f16(
 ; GFX11-NEXT:    s_mov_b32 s9, s5
 ; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v1
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
@@ -163,8 +161,6 @@ define amdgpu_kernel void @fmul_f16_imm_a(
 ; GFX11-NEXT:    s_mov_b32 s5, s1
 ; GFX11-NEXT:    v_mul_f16_e32 v0, 0x4200, v0
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %b) {
@@ -248,8 +244,6 @@ define amdgpu_kernel void @fmul_f16_imm_b(
 ; GFX11-NEXT:    s_mov_b32 s5, s1
 ; GFX11-NEXT:    v_mul_f16_e32 v0, 4.0, v0
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
@@ -362,8 +356,6 @@ define amdgpu_kernel void @fmul_v2f16(
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_mul_f16 v0, v0, v1
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
@@ -459,8 +451,6 @@ define amdgpu_kernel void @fmul_v2f16_imm_a(
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_mul_f16 v0, 0x44004200, v0
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %b) {
@@ -554,8 +544,6 @@ define amdgpu_kernel void @fmul_v2f16_imm_b(
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_mul_f16 v0, 0x42004400, v0
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
@@ -685,8 +673,6 @@ define amdgpu_kernel void @fmul_v4f16(
 ; GFX11-NEXT:    v_pk_mul_f16 v1, v3, v1
 ; GFX11-NEXT:    v_pk_mul_f16 v0, v2, v0
 ; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
@@ -797,8 +783,6 @@ define amdgpu_kernel void @fmul_v4f16_imm_a(
 ; GFX11-NEXT:    v_pk_mul_f16 v1, 0x44004200, v1
 ; GFX11-NEXT:    v_pk_mul_f16 v0, 0x40004800, v0
 ; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %b) {
diff --git a/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll b/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll
index 2b556a0be2b16d..8298a925343ba3 100644
--- a/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll
@@ -105,8 +105,6 @@ define amdgpu_kernel void @fmuladd_f16(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v3
 ; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-FLUSH-NEXT:    s_nop 0
-; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FLUSH-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-LABEL: fmuladd_f16:
@@ -121,8 +119,6 @@ define amdgpu_kernel void @fmuladd_f16(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-DENORM-NEXT:    v_fmac_f16_e32 v3, v1, v2
 ; GFX11-DENORM-NEXT:    global_store_b16 v0, v3, s[0:1]
-; GFX11-DENORM-NEXT:    s_nop 0
-; GFX11-DENORM-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DENORM-NEXT:    s_endpgm
                          ptr addrspace(1) %in2, ptr addrspace(1) %in3) #0 {
   %r0 = load half, ptr addrspace(1) %in1
@@ -235,8 +231,6 @@ define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v3
 ; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-FLUSH-NEXT:    s_nop 0
-; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FLUSH-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-STRICT-LABEL: fmul_fadd_f16:
@@ -254,8 +248,6 @@ define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX11-DENORM-STRICT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-DENORM-STRICT-NEXT:    v_add_f16_e32 v1, v1, v3
 ; GFX11-DENORM-STRICT-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-DENORM-STRICT-NEXT:    s_nop 0
-; GFX11-DENORM-STRICT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DENORM-STRICT-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-CONTRACT-LABEL: fmul_fadd_f16:
@@ -270,8 +262,6 @@ define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-DENORM-CONTRACT-NEXT:    v_fmac_f16_e32 v3, v1, v2
 ; GFX11-DENORM-CONTRACT-NEXT:    global_store_b16 v0, v3, s[0:1]
-; GFX11-DENORM-CONTRACT-NEXT:    s_nop 0
-; GFX11-DENORM-CONTRACT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DENORM-CONTRACT-NEXT:    s_endpgm
                          ptr addrspace(1) %in2, ptr addrspace(1) %in3) #0 {
   %r0 = load half, ptr addrspace(1) %in1
@@ -369,8 +359,6 @@ define amdgpu_kernel void @fmul_fadd_contract_f16(ptr addrspace(1) %out, ptr add
 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v3
 ; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-FLUSH-NEXT:    s_nop 0
-; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FLUSH-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-LABEL: fmul_fadd_contract_f16:
@@ -385,8 +373,6 @@ define amdgpu_kernel void @fmul_fadd_contract_f16(ptr addrspace(1) %out, ptr add
 ; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-DENORM-NEXT:    v_fmac_f16_e32 v3, v1, v2
 ; GFX11-DENORM-NEXT:    global_store_b16 v0, v3, s[0:1]
-; GFX11-DENORM-NEXT:    s_nop 0
-; GFX11-DENORM-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DENORM-NEXT:    s_endpgm
                          ptr addrspace(1) %in2, ptr addrspace(1) %in3) #0 {
   %r0 = load half, ptr addrspace(1) %in1
@@ -477,8 +463,6 @@ define amdgpu_kernel void @fmuladd_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrsp
 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v2
 ; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-FLUSH-NEXT:    s_nop 0
-; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FLUSH-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-LABEL: fmuladd_2.0_a_b_f16:
@@ -494,8 +478,6 @@ define amdgpu_kernel void @fmuladd_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrsp
 ; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-DENORM-NEXT:    v_fmac_f16_e32 v2, 2.0, v1
 ; GFX11-DENORM-NEXT:    global_store_b16 v0, v2, s[0:1]
-; GFX11-DENORM-NEXT:    s_nop 0
-; GFX11-DENORM-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DENORM-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr half, ptr addrspace(1) %out, i32 %tid
@@ -589,8 +571,6 @@ define amdgpu_kernel void @fmuladd_a_2.0_b_f16(ptr addrspace(1) %out, ptr addrsp
 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v2
 ; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-FLUSH-NEXT:    s_nop 0
-; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FLUSH-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-LABEL: fmuladd_a_2.0_b_f16:
@@ -606,8 +586,6 @@ define amdgpu_kernel void @fmuladd_a_2.0_b_f16(ptr addrspace(1) %out, ptr addrsp
 ; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-DENORM-NEXT:    v_fmac_f16_e32 v2, 2.0, v1
 ; GFX11-DENORM-NEXT:    global_store_b16 v0, v2, s[0:1]
-; GFX11-DENORM-NEXT:    s_nop 0
-; GFX11-DENORM-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DENORM-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr half, ptr addrspace(1) %out, i32 %tid
@@ -715,8 +693,6 @@ define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out,
 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v2
 ; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-FLUSH-NEXT:    s_nop 0
-; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FLUSH-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-STRICT-LABEL: fadd_a_a_b_f16:
@@ -734,8 +710,6 @@ define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out,
 ; GFX11-DENORM-STRICT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-DENORM-STRICT-NEXT:    v_add_f16_e32 v1, v1, v2
 ; GFX11-DENORM-STRICT-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-DENORM-STRICT-NEXT:    s_nop 0
-; GFX11-DENORM-STRICT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DENORM-STRICT-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-CONTRACT-LABEL: fadd_a_a_b_f16:
@@ -751,8 +725,6 @@ define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out,
 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-DENORM-CONTRACT-NEXT:    v_fmac_f16_e32 v2, 2.0, v1
 ; GFX11-DENORM-CONTRACT-NEXT:    global_store_b16 v0, v2, s[0:1]
-; GFX11-DENORM-CONTRACT-NEXT:    s_nop 0
-; GFX11-DENORM-CONTRACT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DENORM-CONTRACT-NEXT:    s_endpgm
                             ptr addrspace(1) %in1,
                             ptr addrspace(1) %in2) #0 {
@@ -863,8 +835,6 @@ define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out,
 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FLUSH-NEXT:    v_add_f16_e32 v1, v2, v1
 ; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-FLUSH-NEXT:    s_nop 0
-; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FLUSH-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-STRICT-LABEL: fadd_b_a_a_f16:
@@ -882,8 +852,6 @@ define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out,
 ; GFX11-DENORM-STRICT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-DENORM-STRICT-NEXT:    v_add_f16_e32 v1, v2, v1
 ; GFX11-DENORM-STRICT-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-DENORM-STRICT-NEXT:    s_nop 0
-; GFX11-DENORM-STRICT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DENORM-STRICT-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-CONTRACT-LABEL: fadd_b_a_a_f16:
@@ -899,8 +867,6 @@ define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out,
 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-DENORM-CONTRACT-NEXT:    v_fmac_f16_e32 v2, 2.0, v1
 ; GFX11-DENORM-CONTRACT-NEXT:    global_store_b16 v0, v2, s[0:1]
-; GFX11-DENORM-CONTRACT-NEXT:    s_nop 0
-; GFX11-DENORM-CONTRACT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DENORM-CONTRACT-NEXT:    s_endpgm
                             ptr addrspace(1) %in1,
                             ptr addrspace(1) %in2) #0 {
@@ -997,8 +963,6 @@ define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(ptr addrspace(1) %out, ptr ad
 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FLUSH-NEXT:    v_sub_f16_e32 v1, v2, v1
 ; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-FLUSH-NEXT:    s_nop 0
-; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FLUSH-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-LABEL: fmuladd_neg_2.0_a_b_f16:
@@ -1014,8 +978,6 @@ define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(ptr addrspace(1) %out, ptr ad
 ; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-DENORM-NEXT:    v_fmac_f16_e32 v2, -2.0, v1
 ; GFX11-DENORM-NEXT:    global_store_b16 v0, v2, s[0:1]
-; GFX11-DENORM-NEXT:    s_nop 0
-; GFX11-DENORM-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DENORM-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr half, ptr addrspace(1) %out, i32 %tid
@@ -1109,8 +1071,6 @@ define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(ptr addrspace(1) %out, pt
 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FLUSH-NEXT:    v_add_f16_e32 v1, v2, v1
 ; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-FLUSH-NEXT:    s_nop 0
-; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FLUSH-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-LABEL: fmuladd_neg_2.0_neg_a_b_f16:
@@ -1126,8 +1086,6 @@ define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(ptr addrspace(1) %out, pt
 ; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-DENORM-NEXT:    v_fmac_f16_e32 v2, 2.0, v1
 ; GFX11-DENORM-NEXT:    global_store_b16 v0, v2, s[0:1]
-; GFX11-DENORM-NEXT:    s_nop 0
-; GFX11-DENORM-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DENORM-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr half, ptr addrspace(1) %out, i32 %tid
@@ -1223,8 +1181,6 @@ define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr ad
 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FLUSH-NEXT:    v_sub_f16_e32 v1, v2, v1
 ; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-FLUSH-NEXT:    s_nop 0
-; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FLUSH-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-LABEL: fmuladd_2.0_neg_a_b_f16:
@@ -1240,8 +1196,6 @@ define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr ad
 ; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-DENORM-NEXT:    v_fmac_f16_e32 v2, -2.0, v1
 ; GFX11-DENORM-NEXT:    global_store_b16 v0, v2, s[0:1]
-; GFX11-DENORM-NEXT:    s_nop 0
-; GFX11-DENORM-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DENORM-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr half, ptr addrspace(1) %out, i32 %tid
@@ -1337,8 +1291,6 @@ define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(ptr addrspace(1) %out, ptr ad
 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FLUSH-NEXT:    v_sub_f16_e32 v1, v1, v2
 ; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-FLUSH-NEXT:    s_nop 0
-; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FLUSH-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-LABEL: fmuladd_2.0_a_neg_b_f16:
@@ -1354,8 +1306,6 @@ define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(ptr addrspace(1) %out, ptr ad
 ; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-DENORM-NEXT:    v_fma_f16 v1, v1, 2.0, -v2
 ; GFX11-DENORM-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-DENORM-NEXT:    s_nop 0
-; GFX11-DENORM-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DENORM-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr half, ptr addrspace(1) %out, i32 %tid
@@ -1487,8 +1437,6 @@ define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out,
 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FLUSH-NEXT:    v_sub_f16_e32 v1, v1, v3
 ; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-FLUSH-NEXT:    s_nop 0
-; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FLUSH-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-STRICT-LABEL: mad_sub_f16:
@@ -1508,8 +1456,6 @@ define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out,
 ; GFX11-DENORM-STRICT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-DENORM-STRICT-NEXT:    v_sub_f16_e32 v1, v1, v3
 ; GFX11-DENORM-STRICT-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-DENORM-STRICT-NEXT:    s_nop 0
-; GFX11-DENORM-STRICT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DENORM-STRICT-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-CONTRACT-LABEL: mad_sub_f16:
@@ -1527,8 +1473,6 @@ define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out,
 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-DENORM-CONTRACT-NEXT:    v_fma_f16 v1, v1, v2, -v3
 ; GFX11-DENORM-CONTRACT-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-DENORM-CONTRACT-NEXT:    s_nop 0
-; GFX11-DENORM-CONTRACT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DENORM-CONTRACT-NEXT:    s_endpgm
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %tid.ext = sext i32 %tid to i64
@@ -1662,8 +1606,6 @@ define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %o
 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FLUSH-NEXT:    v_sub_f16_e32 v1, v3, v1
 ; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-FLUSH-NEXT:    s_nop 0
-; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FLUSH-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-STRICT-LABEL: mad_sub_inv_f16:
@@ -1683,8 +1625,6 @@ define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %o
 ; GFX11-DENORM-STRICT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-DENORM-STRICT-NEXT:    v_sub_f16_e32 v1, v3, v1
 ; GFX11-DENORM-STRICT-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-DENORM-STRICT-NEXT:    s_nop 0
-; GFX11-DENORM-STRICT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DENORM-STRICT-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-CONTRACT-LABEL: mad_sub_inv_f16:
@@ -1702,8 +1642,6 @@ define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %o
 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-DENORM-CONTRACT-NEXT:    v_fma_f16 v1, -v1, v2, v3
 ; GFX11-DENORM-CONTRACT-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-DENORM-CONTRACT-NEXT:    s_nop 0
-; GFX11-DENORM-CONTRACT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DENORM-CONTRACT-NEXT:    s_endpgm
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %tid.ext = sext i32 %tid to i64
@@ -1837,8 +1775,6 @@ define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture %
 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FLUSH-NEXT:    v_sub_f16_e64 v1, v1, |v3|
 ; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-FLUSH-NEXT:    s_nop 0
-; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FLUSH-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-STRICT-LABEL: mad_sub_fabs_f16:
@@ -1858,8 +1794,6 @@ define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture %
 ; GFX11-DENORM-STRICT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-DENORM-STRICT-NEXT:    v_sub_f16_e64 v1, v1, |v3|
 ; GFX11-DENORM-STRICT-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-DENORM-STRICT-NEXT:    s_nop 0
-; GFX11-DENORM-STRICT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DENORM-STRICT-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-CONTRACT-LABEL: mad_sub_fabs_f16:
@@ -1877,8 +1811,6 @@ define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture %
 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-DENORM-CONTRACT-NEXT:    v_fma_f16 v1, v1, v2, -|v3|
 ; GFX11-DENORM-CONTRACT-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-DENORM-CONTRACT-NEXT:    s_nop 0
-; GFX11-DENORM-CONTRACT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DENORM-CONTRACT-NEXT:    s_endpgm
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %tid.ext = sext i32 %tid to i64
@@ -2013,8 +1945,6 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocaptu
 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FLUSH-NEXT:    v_sub_f16_e64 v1, |v3|, v1
 ; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-FLUSH-NEXT:    s_nop 0
-; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FLUSH-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-STRICT-LABEL: mad_sub_fabs_inv_f16:
@@ -2034,8 +1964,6 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocaptu
 ; GFX11-DENORM-STRICT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-DENORM-STRICT-NEXT:    v_sub_f16_e64 v1, |v3|, v1
 ; GFX11-DENORM-STRICT-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-DENORM-STRICT-NEXT:    s_nop 0
-; GFX11-DENORM-STRICT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DENORM-STRICT-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-CONTRACT-LABEL: mad_sub_fabs_inv_f16:
@@ -2053,8 +1981,6 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocaptu
 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-DENORM-CONTRACT-NEXT:    v_fma_f16 v1, -v1, v2, |v3|
 ; GFX11-DENORM-CONTRACT-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-DENORM-CONTRACT-NEXT:    s_nop 0
-; GFX11-DENORM-CONTRACT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DENORM-CONTRACT-NEXT:    s_endpgm
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %tid.ext = sext i32 %tid to i64
@@ -2189,8 +2115,6 @@ define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %o
 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FLUSH-NEXT:    v_add_f16_e32 v1, v3, v1
 ; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-FLUSH-NEXT:    s_nop 0
-; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FLUSH-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-STRICT-LABEL: neg_neg_mad_f16:
@@ -2210,8 +2134,6 @@ define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %o
 ; GFX11-DENORM-STRICT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-DENORM-STRICT-NEXT:    v_add_f16_e32 v1, v3, v1
 ; GFX11-DENORM-STRICT-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-DENORM-STRICT-NEXT:    s_nop 0
-; GFX11-DENORM-STRICT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DENORM-STRICT-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-CONTRACT-LABEL: neg_neg_mad_f16:
@@ -2229,8 +2151,6 @@ define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %o
 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-DENORM-CONTRACT-NEXT:    v_fmac_f16_e32 v3, v1, v2
 ; GFX11-DENORM-CONTRACT-NEXT:    global_store_b16 v0, v3, s[0:1]
-; GFX11-DENORM-CONTRACT-NEXT:    s_nop 0
-; GFX11-DENORM-CONTRACT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DENORM-CONTRACT-NEXT:    s_endpgm
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %tid.ext = sext i32 %tid to i64
@@ -2366,8 +2286,6 @@ define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture %
 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FLUSH-NEXT:    v_sub_f16_e32 v1, v1, v3
 ; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-FLUSH-NEXT:    s_nop 0
-; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FLUSH-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-STRICT-LABEL: mad_fabs_sub_f16:
@@ -2387,8 +2305,6 @@ define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture %
 ; GFX11-DENORM-STRICT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-DENORM-STRICT-NEXT:    v_sub_f16_e32 v1, v1, v3
 ; GFX11-DENORM-STRICT-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-DENORM-STRICT-NEXT:    s_nop 0
-; GFX11-DENORM-STRICT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DENORM-STRICT-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-CONTRACT-LABEL: mad_fabs_sub_f16:
@@ -2406,8 +2322,6 @@ define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture %
 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-DENORM-CONTRACT-NEXT:    v_fma_f16 v1, v1, |v2|, -v3
 ; GFX11-DENORM-CONTRACT-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-DENORM-CONTRACT-NEXT:    s_nop 0
-; GFX11-DENORM-CONTRACT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DENORM-CONTRACT-NEXT:    s_endpgm
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %tid.ext = sext i32 %tid to i64
@@ -2520,8 +2434,6 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrsp
 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FLUSH-NEXT:    v_sub_f16_e32 v1, v2, v1
 ; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-FLUSH-NEXT:    s_nop 0
-; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FLUSH-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-STRICT-LABEL: fsub_c_fadd_a_a_f16:
@@ -2539,8 +2451,6 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrsp
 ; GFX11-DENORM-STRICT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-DENORM-STRICT-NEXT:    v_sub_f16_e32 v1, v2, v1
 ; GFX11-DENORM-STRICT-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-DENORM-STRICT-NEXT:    s_nop 0
-; GFX11-DENORM-STRICT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DENORM-STRICT-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-CONTRACT-LABEL: fsub_c_fadd_a_a_f16:
@@ -2556,8 +2466,6 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrsp
 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-DENORM-CONTRACT-NEXT:    v_fmac_f16_e32 v2, -2.0, v1
 ; GFX11-DENORM-CONTRACT-NEXT:    global_store_b16 v0, v2, s[0:1]
-; GFX11-DENORM-CONTRACT-NEXT:    s_nop 0
-; GFX11-DENORM-CONTRACT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DENORM-CONTRACT-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr half, ptr addrspace(1) %out, i32 %tid
@@ -2667,8 +2575,6 @@ define amdgpu_kernel void @fsub_fadd_a_a_c_f16(ptr addrspace(1) %out, ptr addrsp
 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FLUSH-NEXT:    v_sub_f16_e32 v1, v1, v2
 ; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-FLUSH-NEXT:    s_nop 0
-; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FLUSH-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-STRICT-LABEL: fsub_fadd_a_a_c_f16:
@@ -2686,8 +2592,6 @@ define amdgpu_kernel void @fsub_fadd_a_a_c_f16(ptr addrspace(1) %out, ptr addrsp
 ; GFX11-DENORM-STRICT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-DENORM-STRICT-NEXT:    v_sub_f16_e32 v1, v1, v2
 ; GFX11-DENORM-STRICT-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-DENORM-STRICT-NEXT:    s_nop 0
-; GFX11-DENORM-STRICT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DENORM-STRICT-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-CONTRACT-LABEL: fsub_fadd_a_a_c_f16:
@@ -2703,8 +2607,6 @@ define amdgpu_kernel void @fsub_fadd_a_a_c_f16(ptr addrspace(1) %out, ptr addrsp
 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-DENORM-CONTRACT-NEXT:    v_fma_f16 v1, v1, 2.0, -v2
 ; GFX11-DENORM-CONTRACT-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-DENORM-CONTRACT-NEXT:    s_nop 0
-; GFX11-DENORM-CONTRACT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DENORM-CONTRACT-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr half, ptr addrspace(1) %out, i32 %tid
diff --git a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll
index ba8b6fb80518fc..435d5bd79d095c 100644
--- a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll
+++ b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll
@@ -60,8 +60,6 @@ define amdgpu_kernel void @fnearbyint_f16(ptr addrspace(1) %out, half %in) #1 {
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_rndne_f16_e32 v1, s4
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %1 = call half @llvm.nearbyint.f16(half %in)
   store half %1, ptr addrspace(1) %out
@@ -100,8 +98,6 @@ define amdgpu_kernel void @fnearbyint_f32(ptr addrspace(1) %out, float %in) #1 {
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_rndne_f32_e32 v1, s4
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %0 = call float @llvm.nearbyint.f32(float %in)
@@ -142,8 +138,6 @@ define amdgpu_kernel void @fnearbyint_v2f32(ptr addrspace(1) %out, <2 x float> %
 ; GFX11-NEXT:    v_rndne_f32_e32 v1, s3
 ; GFX11-NEXT:    v_rndne_f32_e32 v0, s2
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %0 = call <2 x float> @llvm.nearbyint.v2f32(<2 x float> %in)
@@ -192,8 +186,6 @@ define amdgpu_kernel void @fnearbyint_v4f32(ptr addrspace(1) %out, <4 x float> %
 ; GFX11-NEXT:    v_rndne_f32_e32 v1, s5
 ; GFX11-NEXT:    v_rndne_f32_e32 v0, s4
 ; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %0 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %in)
@@ -253,8 +245,6 @@ define amdgpu_kernel void @nearbyint_f64(ptr addrspace(1) %out, double %in) {
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_rndne_f64_e32 v[0:1], s[2:3]
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %0 = call double @llvm.nearbyint.f64(double %in)
@@ -329,8 +319,6 @@ define amdgpu_kernel void @nearbyint_v2f64(ptr addrspace(1) %out, <2 x double> %
 ; GFX11-NEXT:    v_rndne_f64_e32 v[2:3], s[6:7]
 ; GFX11-NEXT:    v_rndne_f64_e32 v[0:1], s[4:5]
 ; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %0 = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %in)
@@ -437,8 +425,6 @@ define amdgpu_kernel void @nearbyint_v4f64(ptr addrspace(1) %out, <4 x double> %
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b128 v8, v[4:7], s[0:1] offset:16
 ; GFX11-NEXT:    global_store_b128 v8, v[0:3], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %0 = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> %in)
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
index 8267bb9f5450f8..b821f9968490e3 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
@@ -57,8 +57,6 @@ define amdgpu_kernel void @fneg_fabs_fadd_f16(ptr addrspace(1) %out, half %x, ha
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_sub_f16_e64 v1, s2, |s4|
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %fabs = call half @llvm.fabs.f16(half %x)
   %fsub = fsub half -0.0, %fabs
@@ -121,8 +119,6 @@ define amdgpu_kernel void @fneg_fabs_fmul_f16(ptr addrspace(1) %out, half %x, ha
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_mul_f16_e64 v1, s2, -|s4|
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %fabs = call half @llvm.fabs.f16(half %x)
   %fsub = fsub half -0.0, %fabs
@@ -180,8 +176,6 @@ define amdgpu_kernel void @fneg_fabs_free_f16(ptr addrspace(1) %out, i16 %in) {
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %bc = bitcast i16 %in to half
   %fabs = call half @llvm.fabs.f16(half %bc)
@@ -236,8 +230,6 @@ define amdgpu_kernel void @fneg_fabs_f16(ptr addrspace(1) %out, half %in) {
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %fabs = call half @llvm.fabs.f16(half %in)
   %fsub = fsub half -0.0, %fabs
@@ -280,8 +272,6 @@ define amdgpu_kernel void @v_fneg_fabs_f16(ptr addrspace(1) %out, ptr addrspace(
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_or_b32_e32 v1, 0x8000, v1
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %val = load half, ptr addrspace(1) %in, align 2
   %fabs = call half @llvm.fabs.f16(half %val)
@@ -352,8 +342,6 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(ptr addrspace(1) %out, <
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_or_b32_e32 v0, 0x80008000, v0
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %add = fadd <2 x half> %in, <half 1.0, half 2.0>
   %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %add)
@@ -410,8 +398,6 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_bc_src(ptr addrspace(1) %out, <2 x
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in)
   %fneg.fabs = fsub <2 x half> <half -0.0, half -0.0>, %fabs
@@ -455,8 +441,6 @@ define amdgpu_kernel void @fneg_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in
 ; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %in)
   %fsub = fsub <4 x half> <half -0.0, half -0.0, half -0.0, half -0.0>, %fabs
@@ -523,8 +507,6 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(ptr addrspace(1) %out, <2 x
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_pk_mul_f16 v1, s2, -4.0 op_sel_hi:[1,0]
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in)
   %fneg.fabs = fsub <2 x half> <half -0.0, half -0.0>, %fabs
@@ -596,8 +578,6 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_v2f16(ptr addrspace(1) %out0, p
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    global_store_b32 v0, v2, s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in)
   %fneg = fsub <2 x half> <half -0.0, half -0.0>, %fabs
@@ -678,8 +658,6 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2f16(ptr addrspac
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    global_store_b32 v0, v2, s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in)
   %fneg = fsub <2 x half> <half -0.0, half -0.0>, %fabs
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
index 98b17bbaa0a959..4412e04e121a96 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
@@ -1535,8 +1535,6 @@ define amdgpu_kernel void @multiple_uses_fneg_select_f64(double %x, double %y, i
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, s2, -v0, vcc_lo
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %a = select i1 %z, double %x, double %y
   %b = fneg double %a
diff --git a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll
index 40982347f3ca00..b9dd2727b36784 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll
@@ -51,8 +51,6 @@ define amdgpu_kernel void @s_fneg_f16(ptr addrspace(1) %out, half %in) #0 {
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %fneg = fsub half -0.0, %in
   store half %fneg, ptr addrspace(1) %out
@@ -112,8 +110,6 @@ define amdgpu_kernel void @v_fneg_f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.in = getelementptr inbounds half, ptr addrspace(1) %in, i32 %tid
@@ -170,8 +166,6 @@ define amdgpu_kernel void @s_fneg_free_f16(ptr addrspace(1) %out, i16 %in) #0 {
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %bc = bitcast i16 %in to half
   %fsub = fsub half -0.0, %bc
@@ -231,8 +225,6 @@ define amdgpu_kernel void @v_fneg_fold_f16(ptr addrspace(1) %out, ptr addrspace(
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mul_f16_e64 v1, -v1, v1
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %val = load half, ptr addrspace(1) %in
   %fsub = fsub half -0.0, %val
@@ -287,8 +279,6 @@ define amdgpu_kernel void @s_fneg_v2f16(ptr addrspace(1) %out, <2 x half> %in) #
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %fneg = fsub <2 x half> <half -0.0, half -0.0>, %in
   store <2 x half> %fneg, ptr addrspace(1) %out
@@ -334,8 +324,6 @@ define amdgpu_kernel void @s_fneg_v2f16_nonload(ptr addrspace(1) %out) #0 {
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %in = call i32 asm sideeffect "; def $0", "=s"()
   %in.bc = bitcast i32 %in to <2 x half>
@@ -395,8 +383,6 @@ define amdgpu_kernel void @v_fneg_v2f16(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_xor_b32_e32 v1, 0x80008000, v1
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.in = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i32 %tid
@@ -453,8 +439,6 @@ define amdgpu_kernel void @fneg_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 {
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %bc = bitcast i32 %in to <2 x half>
   %fsub = fsub <2 x half> <half -0.0, half -0.0>, %bc
@@ -525,8 +509,6 @@ define amdgpu_kernel void @v_fneg_fold_v2f16(ptr addrspace(1) %out, ptr addrspac
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_mul_f16 v1, v1, v1 neg_lo:[1,0] neg_hi:[1,0]
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %val = load <2 x half>, ptr addrspace(1) %in
   %fsub = fsub <2 x half> <half -0.0, half -0.0>, %val
@@ -605,8 +587,6 @@ define amdgpu_kernel void @v_extract_fneg_fold_v2f16(ptr addrspace(1) %in) #0 {
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    global_store_b16 v[0:1], v1, off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %val = load <2 x half>, ptr addrspace(1) %in
   %fneg = fsub <2 x half> <half -0.0, half -0.0>, %val
@@ -663,8 +643,6 @@ define amdgpu_kernel void @v_extract_fneg_no_fold_v2f16(ptr addrspace(1) %in) #0
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    global_store_d16_hi_b16 v[0:1], v0, off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %val = load <2 x half>, ptr addrspace(1) %in
   %fneg = fsub <2 x half> <half -0.0, half -0.0>, %val
diff --git a/llvm/test/CodeGen/AMDGPU/fneg.ll b/llvm/test/CodeGen/AMDGPU/fneg.ll
index e447429539e6ff..d8809132883a9e 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg.ll
@@ -39,8 +39,6 @@ define amdgpu_kernel void @s_fneg_f32(ptr addrspace(1) %out, float %in) {
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %fneg = fsub float -0.000000e+00, %in
   store float %fneg, ptr addrspace(1) %out
@@ -86,8 +84,6 @@ define amdgpu_kernel void @s_fneg_v2f32(ptr addrspace(1) nocapture %out, <2 x fl
 ; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %fneg = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %in
   store <2 x float> %fneg, ptr addrspace(1) %out
@@ -146,8 +142,6 @@ define amdgpu_kernel void @s_fneg_v4f32(ptr addrspace(1) nocapture %out, <4 x fl
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v2, s3
 ; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %fneg = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %in
   store <4 x float> %fneg, ptr addrspace(1) %out
@@ -186,8 +180,6 @@ define amdgpu_kernel void @fsub0_f32(ptr addrspace(1) %out, i32 %in) {
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_sub_f32_e64 v1, 0, s4
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %bc = bitcast i32 %in to float
   %fsub = fsub float 0.0, %bc
@@ -230,8 +222,6 @@ define amdgpu_kernel void @fneg_free_f32(ptr addrspace(1) %out, i32 %in) {
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %bc = bitcast i32 %in to float
   %fsub = fsub float -0.0, %bc
@@ -271,8 +261,6 @@ define amdgpu_kernel void @fneg_fold_f32(ptr addrspace(1) %out, float %in) {
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_mul_f32_e64 v1, -s4, s4
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %fsub = fsub float -0.0, %in
   %fmul = fmul float %fsub, %in
@@ -313,8 +301,6 @@ define amdgpu_kernel void @bitpreserve_fneg_f32(ptr addrspace(1) %out, float %in
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_mul_f32_e64 v1, s4, -4.0
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %in.bc = bitcast float %in to i32
   %int.abs = xor i32 %in.bc, 2147483648
@@ -359,8 +345,6 @@ define amdgpu_kernel void @s_fneg_i32(ptr addrspace(1) %out, i32 %in) {
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %fneg = xor i32 %in, -2147483648
   store i32 %fneg, ptr addrspace(1) %out
@@ -409,8 +393,6 @@ define amdgpu_kernel void @s_fneg_i32_fp_use(ptr addrspace(1) %out, i32 %in) {
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_sub_f32_e64 v1, 2.0, s4
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %fneg = xor i32 %in, -2147483648
   %bitcast = bitcast i32 %fneg to float
@@ -467,8 +449,6 @@ define amdgpu_kernel void @s_fneg_i64(ptr addrspace(1) %out, i64 %in) {
 ; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %fneg = xor i64 %in, -9223372036854775808
   store i64 %fneg, ptr addrspace(1) %out
@@ -515,8 +495,6 @@ define amdgpu_kernel void @s_fneg_i64_fp_use(ptr addrspace(1) %out, i64 %in) {
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_add_f64 v[0:1], -s[2:3], 2.0
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %fneg = xor i64 %in, -9223372036854775808
   %bitcast = bitcast i64 %fneg to double
@@ -582,8 +560,6 @@ define amdgpu_kernel void @s_fneg_i16_fp_use(ptr addrspace(1) %out, i16 %in) {
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_sub_f16_e64 v1, 2.0, s4
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %fneg = xor i16 %in, -32768
   %bitcast = bitcast i16 %fneg to half
@@ -657,8 +633,6 @@ define amdgpu_kernel void @s_fneg_v2i16(ptr addrspace(1) %out, i32 %arg) {
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %in = bitcast i32 %arg to <2 x i16>
   %fneg = xor <2 x i16> %in, <i16 -32768, i16 -32768>
@@ -741,8 +715,6 @@ define amdgpu_kernel void @s_fneg_v2i16_fp_use(ptr addrspace(1) %out, i32 %arg)
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_pk_add_f16 v1, s4, 2.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0]
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %in = bitcast i32 %arg to <2 x i16>
   %fneg = xor <2 x i16> %in, <i16 -32768, i16 -32768>
diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll
index e94e2ee9f37d8e..a0578756433ff8 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll
@@ -11,15 +11,11 @@ define amdgpu_ps void @raw_buffer_atomic_add_v2f16_noret_offset(<2 x half> %val,
 ; GFX12-SDAG-LABEL: raw_buffer_atomic_add_v2f16_noret_offset:
 ; GFX12-SDAG:       ; %bb.0:
 ; GFX12-SDAG-NEXT:    buffer_atomic_pk_add_f16 v0, off, s[0:3], s4 offset:92
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: raw_buffer_atomic_add_v2f16_noret_offset:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    buffer_atomic_pk_add_bf16 v0, off, s[0:3], s4 offset:92
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 92, i32 %soffset, i32 0)
   ret void
@@ -29,15 +25,11 @@ define amdgpu_ps void @raw_buffer_atomic_add_v2f16_noret(<2 x half> %val, <4 x i
 ; GFX12-SDAG-LABEL: raw_buffer_atomic_add_v2f16_noret:
 ; GFX12-SDAG:       ; %bb.0:
 ; GFX12-SDAG-NEXT:    buffer_atomic_pk_add_f16 v0, v1, s[0:3], s4 offen
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: raw_buffer_atomic_add_v2f16_noret:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    buffer_atomic_pk_add_bf16 v0, v1, s[0:3], s4 offen
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
   ret void
@@ -96,15 +88,11 @@ define amdgpu_ps void @struct_buffer_atomic_add_v2f16_noret(<2 x half> %val, <4
 ; GFX12-SDAG-LABEL: struct_buffer_atomic_add_v2f16_noret:
 ; GFX12-SDAG:       ; %bb.0:
 ; GFX12-SDAG-NEXT:    buffer_atomic_pk_add_f16 v0, v[1:2], s[0:3], s4 idxen offen
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: struct_buffer_atomic_add_v2f16_noret:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    buffer_atomic_pk_add_bf16 v0, v[1:2], s[0:3], s4 idxen offen
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %orig = call <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
   ret void
@@ -141,15 +129,11 @@ define amdgpu_ps void @struct_buffer_atomic_add_v2bf16_noret(<2 x bfloat> %val,
 ; GFX12-SDAG-LABEL: struct_buffer_atomic_add_v2bf16_noret:
 ; GFX12-SDAG:       ; %bb.0:
 ; GFX12-SDAG-NEXT:    buffer_atomic_pk_add_bf16 v0, v[1:2], s[0:3], s4 idxen offen
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: struct_buffer_atomic_add_v2bf16_noret:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    buffer_atomic_pk_add_bf16 v0, v[1:2], s[0:3], s4 idxen offen
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %orig = call <2 x bfloat> @llvm.amdgcn.struct.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
   ret void
@@ -159,15 +143,11 @@ define amdgpu_ps void @raw_buffer_atomic_add_v2bf16(<2 x bfloat> %val, <4 x i32>
 ; GFX12-SDAG-LABEL: raw_buffer_atomic_add_v2bf16:
 ; GFX12-SDAG:       ; %bb.0:
 ; GFX12-SDAG-NEXT:    buffer_atomic_pk_add_bf16 v0, v1, s[0:3], s4 offen
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: raw_buffer_atomic_add_v2bf16:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    buffer_atomic_pk_add_bf16 v0, v1, s[0:3], s4 offen
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %ret = call <2 x bfloat> @llvm.amdgcn.raw.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/fp-classify.ll b/llvm/test/CodeGen/AMDGPU/fp-classify.ll
index fb731cc00d3f01..038e7b4f5e2bb8 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-classify.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-classify.ll
@@ -44,8 +44,6 @@ define amdgpu_kernel void @test_isinf_pattern(ptr addrspace(1) nocapture %out, f
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %fabs = tail call float @llvm.fabs.f32(float %x) #1
   %cmp = fcmp oeq float %fabs, 0x7FF0000000000000
@@ -92,8 +90,6 @@ define amdgpu_kernel void @test_not_isinf_pattern_0(ptr addrspace(1) nocapture %
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %fabs = tail call float @llvm.fabs.f32(float %x) #1
   %cmp = fcmp ueq float %fabs, 0x7FF0000000000000
@@ -129,8 +125,6 @@ define amdgpu_kernel void @test_not_isinf_pattern_1(ptr addrspace(1) nocapture %
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v0, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %fabs = tail call float @llvm.fabs.f32(float %x) #1
   %cmp = fcmp oeq float %fabs, 0xFFF0000000000000
@@ -177,8 +171,6 @@ define amdgpu_kernel void @test_isfinite_pattern_0(ptr addrspace(1) nocapture %o
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %ord = fcmp ord float %x, 0.000000e+00
   %x.fabs = tail call float @llvm.fabs.f32(float %x) #1
@@ -227,8 +219,6 @@ define amdgpu_kernel void @test_isfinite_pattern_1(ptr addrspace(1) nocapture %o
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %x.fabs = tail call float @llvm.fabs.f32(float %x) #3
   %cmpinf = fcmp one float %x.fabs, 0x7FF0000000000000
@@ -274,8 +264,6 @@ define amdgpu_kernel void @test_isfinite_not_pattern_0(ptr addrspace(1) nocaptur
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %ord = fcmp ord float %x, 0.000000e+00
   %x.fabs = tail call float @llvm.fabs.f32(float %x) #1
@@ -331,8 +319,6 @@ define amdgpu_kernel void @test_isfinite_not_pattern_1(ptr addrspace(1) nocaptur
 ; GFX11-NEXT:    s_and_b32 s2, s2, s3
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %ord = fcmp ord float %x, 0.000000e+00
   %ninf = fcmp une float %x, 0x7FF0000000000000
@@ -385,8 +371,6 @@ define amdgpu_kernel void @test_isfinite_not_pattern_2(ptr addrspace(1) nocaptur
 ; GFX11-NEXT:    s_and_b32 s2, s2, s3
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %ord = fcmp ord float %x, 0.000000e+00
   %x.fabs = tail call float @llvm.fabs.f32(float %y) #1
@@ -442,8 +426,6 @@ define amdgpu_kernel void @test_isfinite_not_pattern_3(ptr addrspace(1) nocaptur
 ; GFX11-NEXT:    s_and_b32 s2, s2, s3
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %ord = fcmp uno float %x, 0.000000e+00
   %x.fabs = tail call float @llvm.fabs.f32(float %x) #1
@@ -492,8 +474,6 @@ define amdgpu_kernel void @test_isfinite_pattern_4(ptr addrspace(1) nocapture %o
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %ord = fcmp ord float %x, 0.000000e+00
   %x.fabs = tail call float @llvm.fabs.f32(float %x) #1
@@ -542,8 +522,6 @@ define amdgpu_kernel void @test_isfinite_pattern_4_commute_and(ptr addrspace(1)
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %ord = fcmp ord float %x, 0.000000e+00
   %x.fabs = tail call float @llvm.fabs.f32(float %x) #1
@@ -603,8 +581,6 @@ define amdgpu_kernel void @test_not_isfinite_pattern_4_wrong_ord_test(ptr addrsp
 ; GFX11-NEXT:    s_and_b32 s2, s2, s3
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %ord = fcmp ord float %x, %y
   %x.fabs = tail call float @llvm.fabs.f32(float %x) #1
@@ -654,8 +630,6 @@ define amdgpu_kernel void @test_isinf_pattern_f16(ptr addrspace(1) nocapture %ou
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %fabs = tail call half @llvm.fabs.f16(half %x) #1
   %cmp = fcmp oeq half %fabs, 0xH7C00
@@ -706,8 +680,6 @@ define amdgpu_kernel void @test_isfinite_pattern_0_f16(ptr addrspace(1) nocaptur
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %ord = fcmp ord half %x, 0.0
   %x.fabs = tail call half @llvm.fabs.f16(half %x) #1
@@ -760,8 +732,6 @@ define amdgpu_kernel void @test_isfinite_pattern_4_f16(ptr addrspace(1) nocaptur
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %ord = fcmp ord half %x, 0.0
   %x.fabs = tail call half @llvm.fabs.f16(half %x) #1
diff --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll
index 742aeb96fcc2c8..626a22653f7def 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll
@@ -68,8 +68,6 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f32(<4 x i32> inreg %rsrc
 ; GFX1100-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1100-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX1100-NEXT:    buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen
-; GFX1100-NEXT:    s_nop 0
-; GFX1100-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: raw_buffer_atomic_min_noret_f32:
@@ -80,8 +78,6 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f32(<4 x i32> inreg %rsrc
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 ;
 ; G_SI-LABEL: raw_buffer_atomic_min_noret_f32:
@@ -134,8 +130,6 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f32(<4 x i32> inreg %rsrc
 ; G_GFX1100-NEXT:    s_waitcnt lgkmcnt(0)
 ; G_GFX1100-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; G_GFX1100-NEXT:    buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen
-; G_GFX1100-NEXT:    s_nop 0
-; G_GFX1100-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; G_GFX1100-NEXT:    s_endpgm
 main_body:
   %ret = call float @llvm.amdgcn.raw.buffer.atomic.fmin.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
@@ -180,8 +174,6 @@ define amdgpu_ps void @raw_buffer_atomic_min_rtn_f32(<4 x i32> inreg %rsrc, floa
 ; GFX1100-NEXT:    buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen glc
 ; GFX1100-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1100-NEXT:    global_store_b32 v[0:1], v0, off
-; GFX1100-NEXT:    s_nop 0
-; GFX1100-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: raw_buffer_atomic_min_rtn_f32:
@@ -189,8 +181,6 @@ define amdgpu_ps void @raw_buffer_atomic_min_rtn_f32(<4 x i32> inreg %rsrc, floa
 ; GFX12-NEXT:    buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v[0:1], v0, off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 ;
 ; G_SI-LABEL: raw_buffer_atomic_min_rtn_f32:
@@ -230,8 +220,6 @@ define amdgpu_ps void @raw_buffer_atomic_min_rtn_f32(<4 x i32> inreg %rsrc, floa
 ; G_GFX1100-NEXT:    buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen glc
 ; G_GFX1100-NEXT:    s_waitcnt vmcnt(0)
 ; G_GFX1100-NEXT:    global_store_b32 v[0:1], v0, off
-; G_GFX1100-NEXT:    s_nop 0
-; G_GFX1100-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; G_GFX1100-NEXT:    s_endpgm
 main_body:
   %ret = call float @llvm.amdgcn.raw.buffer.atomic.fmin.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
@@ -453,8 +441,6 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f32(<4 x i32> inreg %rsrc
 ; GFX1100-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1100-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX1100-NEXT:    buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen
-; GFX1100-NEXT:    s_nop 0
-; GFX1100-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: raw_buffer_atomic_max_noret_f32:
@@ -465,8 +451,6 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f32(<4 x i32> inreg %rsrc
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 ;
 ; G_SI-LABEL: raw_buffer_atomic_max_noret_f32:
@@ -519,8 +503,6 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f32(<4 x i32> inreg %rsrc
 ; G_GFX1100-NEXT:    s_waitcnt lgkmcnt(0)
 ; G_GFX1100-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; G_GFX1100-NEXT:    buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen
-; G_GFX1100-NEXT:    s_nop 0
-; G_GFX1100-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; G_GFX1100-NEXT:    s_endpgm
 main_body:
   %ret = call float @llvm.amdgcn.raw.buffer.atomic.fmax.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
@@ -565,8 +547,6 @@ define amdgpu_ps void @raw_buffer_atomic_max_rtn_f32(<4 x i32> inreg %rsrc, floa
 ; GFX1100-NEXT:    buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen glc
 ; GFX1100-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1100-NEXT:    global_store_b32 v[0:1], v0, off
-; GFX1100-NEXT:    s_nop 0
-; GFX1100-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: raw_buffer_atomic_max_rtn_f32:
@@ -574,8 +554,6 @@ define amdgpu_ps void @raw_buffer_atomic_max_rtn_f32(<4 x i32> inreg %rsrc, floa
 ; GFX12-NEXT:    buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v[0:1], v0, off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 ;
 ; G_SI-LABEL: raw_buffer_atomic_max_rtn_f32:
@@ -615,8 +593,6 @@ define amdgpu_ps void @raw_buffer_atomic_max_rtn_f32(<4 x i32> inreg %rsrc, floa
 ; G_GFX1100-NEXT:    buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen glc
 ; G_GFX1100-NEXT:    s_waitcnt vmcnt(0)
 ; G_GFX1100-NEXT:    global_store_b32 v[0:1], v0, off
-; G_GFX1100-NEXT:    s_nop 0
-; G_GFX1100-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; G_GFX1100-NEXT:    s_endpgm
 main_body:
   %ret = call float @llvm.amdgcn.raw.buffer.atomic.fmax.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
@@ -688,8 +664,6 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre
 ; GFX1100-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1100-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1100-NEXT:    global_store_b32 v1, v0, s[6:7]
-; GFX1100-NEXT:    s_nop 0
-; GFX1100-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc:
@@ -702,8 +676,6 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre
 ; GFX12-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v1, v0, s[6:7]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 ;
 ; G_SI-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc:
@@ -767,8 +739,6 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre
 ; G_GFX1100-NEXT:    v_mov_b32_e32 v1, 0
 ; G_GFX1100-NEXT:    s_waitcnt vmcnt(0)
 ; G_GFX1100-NEXT:    global_store_b32 v1, v0, s[6:7]
-; G_GFX1100-NEXT:    s_nop 0
-; G_GFX1100-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; G_GFX1100-NEXT:    s_endpgm
 main_body:
   %ret = call float @llvm.amdgcn.raw.buffer.atomic.fmax.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2)
diff --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll
index 950d228f299298..ba3735c1186105 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll
@@ -66,8 +66,6 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f32(ptr addrspace(8)
 ; GFX1100-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1100-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX1100-NEXT:    buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen
-; GFX1100-NEXT:    s_nop 0
-; GFX1100-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-NEXT:    s_endpgm
 ;
 ; G_SI-LABEL: raw_ptr_buffer_atomic_min_noret_f32:
@@ -120,8 +118,6 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f32(ptr addrspace(8)
 ; G_GFX1100-NEXT:    s_waitcnt lgkmcnt(0)
 ; G_GFX1100-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; G_GFX1100-NEXT:    buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen
-; G_GFX1100-NEXT:    s_nop 0
-; G_GFX1100-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; G_GFX1100-NEXT:    s_endpgm
 main_body:
   %ret = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fmin.f32(float %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
@@ -166,8 +162,6 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_min_rtn_f32(ptr addrspace(8) inreg
 ; GFX1100-NEXT:    buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen glc
 ; GFX1100-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1100-NEXT:    global_store_b32 v[0:1], v0, off
-; GFX1100-NEXT:    s_nop 0
-; GFX1100-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-NEXT:    s_endpgm
 ;
 ; G_SI-LABEL: raw_ptr_buffer_atomic_min_rtn_f32:
@@ -207,8 +201,6 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_min_rtn_f32(ptr addrspace(8) inreg
 ; G_GFX1100-NEXT:    buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen glc
 ; G_GFX1100-NEXT:    s_waitcnt vmcnt(0)
 ; G_GFX1100-NEXT:    global_store_b32 v[0:1], v0, off
-; G_GFX1100-NEXT:    s_nop 0
-; G_GFX1100-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; G_GFX1100-NEXT:    s_endpgm
 main_body:
   %ret = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fmin.f32(float %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
@@ -417,8 +409,6 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f32(ptr addrspace(8)
 ; GFX1100-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1100-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX1100-NEXT:    buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen
-; GFX1100-NEXT:    s_nop 0
-; GFX1100-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-NEXT:    s_endpgm
 ;
 ; G_SI-LABEL: raw_ptr_buffer_atomic_max_noret_f32:
@@ -471,8 +461,6 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f32(ptr addrspace(8)
 ; G_GFX1100-NEXT:    s_waitcnt lgkmcnt(0)
 ; G_GFX1100-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; G_GFX1100-NEXT:    buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen
-; G_GFX1100-NEXT:    s_nop 0
-; G_GFX1100-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; G_GFX1100-NEXT:    s_endpgm
 main_body:
   %ret = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f32(float %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
@@ -517,8 +505,6 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_max_rtn_f32(ptr addrspace(8) inreg
 ; GFX1100-NEXT:    buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen glc
 ; GFX1100-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1100-NEXT:    global_store_b32 v[0:1], v0, off
-; GFX1100-NEXT:    s_nop 0
-; GFX1100-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-NEXT:    s_endpgm
 ;
 ; G_SI-LABEL: raw_ptr_buffer_atomic_max_rtn_f32:
@@ -558,8 +544,6 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_max_rtn_f32(ptr addrspace(8) inreg
 ; G_GFX1100-NEXT:    buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen glc
 ; G_GFX1100-NEXT:    s_waitcnt vmcnt(0)
 ; G_GFX1100-NEXT:    global_store_b32 v[0:1], v0, off
-; G_GFX1100-NEXT:    s_nop 0
-; G_GFX1100-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; G_GFX1100-NEXT:    s_endpgm
 main_body:
   %ret = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f32(float %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
@@ -631,8 +615,6 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f32_off4_slc(ptr addrsp
 ; GFX1100-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1100-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1100-NEXT:    global_store_b32 v1, v0, s[6:7]
-; GFX1100-NEXT:    s_nop 0
-; GFX1100-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-NEXT:    s_endpgm
 ;
 ; G_SI-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc:
@@ -696,8 +678,6 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f32_off4_slc(ptr addrsp
 ; G_GFX1100-NEXT:    v_mov_b32_e32 v1, 0
 ; G_GFX1100-NEXT:    s_waitcnt vmcnt(0)
 ; G_GFX1100-NEXT:    global_store_b32 v1, v0, s[6:7]
-; G_GFX1100-NEXT:    s_nop 0
-; G_GFX1100-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; G_GFX1100-NEXT:    s_endpgm
 main_body:
   %ret = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f32(float %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 2)
diff --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-num-global-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp-min-max-num-global-atomics.ll
index abe7a5cc43f4f8..a55c3d8c13df4c 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-min-max-num-global-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-min-max-num-global-atomics.ll
@@ -9,8 +9,6 @@ define amdgpu_cs void @global_atomic_fmin_num_f32_noret(ptr addrspace(1) %ptr, f
 ; GFX12-LABEL: global_atomic_fmin_num_f32_noret:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_atomic_min_num_f32 v[0:1], v2, off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %ret = call float @llvm.amdgcn.global.atomic.fmin.num.f32.p1.f32(ptr addrspace(1) %ptr, float %data)
   ret void
@@ -20,8 +18,6 @@ define amdgpu_cs void @global_atomic_fmax_num_f32_noret(ptr addrspace(1) %ptr, f
 ; GFX12-LABEL: global_atomic_fmax_num_f32_noret:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_atomic_max_num_f32 v[0:1], v2, off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %ret = call float @llvm.amdgcn.global.atomic.fmax.num.f32.p1.f32(ptr addrspace(1) %ptr, float %data)
   ret void
@@ -33,8 +29,6 @@ define amdgpu_cs void @global_atomic_fmax_num_f32_rtn(ptr addrspace(1) %ptr, flo
 ; GFX12-NEXT:    global_atomic_max_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v[3:4], v0, off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %ret = call float @llvm.amdgcn.global.atomic.fmax.num.f32.p1.f32(ptr addrspace(1) %ptr, float %data)
   store float %ret, ptr addrspace(1) %out
@@ -47,8 +41,6 @@ define amdgpu_cs void @global_atomic_fmin_num_f32_rtn(ptr addrspace(1) %ptr, flo
 ; GFX12-NEXT:    global_atomic_min_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v[3:4], v0, off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %ret = call float @llvm.amdgcn.global.atomic.fmin.num.f32.p1.f32(ptr addrspace(1) %ptr, float %data)
   store float %ret, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll b/llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll
index 064e88873a175a..a169737493bcf7 100644
--- a/llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll
@@ -61,8 +61,6 @@ define amdgpu_kernel void @test_convert_fp16_to_fp32(ptr addrspace(1) noalias %o
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
 ; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-TRUE16-NEXT:    s_nop 0
-; GFX11-TRUE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: test_convert_fp16_to_fp32:
@@ -81,8 +79,6 @@ define amdgpu_kernel void @test_convert_fp16_to_fp32(ptr addrspace(1) noalias %o
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX11-FAKE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-FAKE16-NEXT:    s_nop 0
-; GFX11-FAKE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FAKE16-NEXT:    s_endpgm
 ;
 ; CYPRESS-LABEL: test_convert_fp16_to_fp32:
diff --git a/llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll b/llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll
index 6c9f451167b7b0..865d64605f65c3 100644
--- a/llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll
@@ -64,8 +64,6 @@ define amdgpu_kernel void @test_convert_fp16_to_fp64(ptr addrspace(1) noalias %o
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
 ; GFX11-TRUE16-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
-; GFX11-TRUE16-NEXT:    s_nop 0
-; GFX11-TRUE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: test_convert_fp16_to_fp64:
@@ -86,8 +84,6 @@ define amdgpu_kernel void @test_convert_fp16_to_fp64(ptr addrspace(1) noalias %o
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
 ; GFX11-FAKE16-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
-; GFX11-FAKE16-NEXT:    s_nop 0
-; GFX11-FAKE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FAKE16-NEXT:    s_endpgm
   %val = load i16, ptr addrspace(1) %in, align 2
   %cvt = call double @llvm.convert.from.fp16.f64(i16 %val) nounwind readnone
diff --git a/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll b/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll
index 5bac710070477f..47b195a8f17067 100644
--- a/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll
@@ -60,8 +60,6 @@ define amdgpu_kernel void @test_convert_fp32_to_fp16(ptr addrspace(1) noalias %o
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
 ; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-TRUE16-NEXT:    s_nop 0
-; GFX11-TRUE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: test_convert_fp32_to_fp16:
@@ -80,8 +78,6 @@ define amdgpu_kernel void @test_convert_fp32_to_fp16(ptr addrspace(1) noalias %o
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX11-FAKE16-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-FAKE16-NEXT:    s_nop 0
-; GFX11-FAKE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FAKE16-NEXT:    s_endpgm
 ;
 ; CYPRESS-LABEL: test_convert_fp32_to_fp16:
diff --git a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll
index a40d678e84d728..d2e6b9266fa5c7 100644
--- a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll
@@ -76,8 +76,6 @@ define amdgpu_kernel void @fpext_f16_to_f32(
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
 ; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-TRUE16-NEXT:    s_nop 0
-; GFX11-TRUE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: fpext_f16_to_f32:
@@ -96,8 +94,6 @@ define amdgpu_kernel void @fpext_f16_to_f32(
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX11-FAKE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-FAKE16-NEXT:    s_nop 0
-; GFX11-FAKE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) #0 {
@@ -184,8 +180,6 @@ define amdgpu_kernel void @fpext_f16_to_f64(
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
 ; GFX11-TRUE16-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
-; GFX11-TRUE16-NEXT:    s_nop 0
-; GFX11-TRUE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: fpext_f16_to_f64:
@@ -206,8 +200,6 @@ define amdgpu_kernel void @fpext_f16_to_f64(
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
 ; GFX11-FAKE16-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
-; GFX11-FAKE16-NEXT:    s_nop 0
-; GFX11-FAKE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) #0 {
@@ -296,8 +288,6 @@ define amdgpu_kernel void @fpext_v2f16_to_v2f32(
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v1, v1.l
 ; GFX11-TRUE16-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
-; GFX11-TRUE16-NEXT:    s_nop 0
-; GFX11-TRUE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: fpext_v2f16_to_v2f32:
@@ -319,8 +309,6 @@ define amdgpu_kernel void @fpext_v2f16_to_v2f32(
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX11-FAKE16-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
-; GFX11-FAKE16-NEXT:    s_nop 0
-; GFX11-FAKE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) #0 {
@@ -418,8 +406,6 @@ define amdgpu_kernel void @fpext_v2f16_to_v2f64(
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
 ; GFX11-TRUE16-NEXT:    buffer_store_b128 v[0:3], off, s[4:7], 0
-; GFX11-TRUE16-NEXT:    s_nop 0
-; GFX11-TRUE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: fpext_v2f16_to_v2f64:
@@ -444,8 +430,6 @@ define amdgpu_kernel void @fpext_v2f16_to_v2f64(
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
 ; GFX11-FAKE16-NEXT:    buffer_store_b128 v[0:3], off, s[4:7], 0
-; GFX11-FAKE16-NEXT:    s_nop 0
-; GFX11-FAKE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
@@ -491,8 +475,6 @@ define amdgpu_kernel void @s_fneg_fpext_f16_to_f32(ptr addrspace(1) %r, i32 %a)
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
 ; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX11-TRUE16-NEXT:    s_nop 0
-; GFX11-TRUE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: s_fneg_fpext_f16_to_f32:
@@ -505,8 +487,6 @@ define amdgpu_kernel void @s_fneg_fpext_f16_to_f32(ptr addrspace(1) %r, i32 %a)
 ; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, s4
 ; GFX11-FAKE16-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX11-FAKE16-NEXT:    s_nop 0
-; GFX11-FAKE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FAKE16-NEXT:    s_endpgm
 entry:
   %a.trunc = trunc i32 %a to i16
@@ -587,8 +567,6 @@ define amdgpu_kernel void @fneg_fpext_f16_to_f32(
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e64 v0, -v0.l
 ; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-TRUE16-NEXT:    s_nop 0
-; GFX11-TRUE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: fneg_fpext_f16_to_f32:
@@ -607,8 +585,6 @@ define amdgpu_kernel void @fneg_fpext_f16_to_f32(
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e64 v0, -v0
 ; GFX11-FAKE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-FAKE16-NEXT:    s_nop 0
-; GFX11-FAKE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
@@ -691,8 +667,6 @@ define amdgpu_kernel void @fabs_fpext_f16_to_f32(
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e64 v0, |v0.l|
 ; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-TRUE16-NEXT:    s_nop 0
-; GFX11-TRUE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: fabs_fpext_f16_to_f32:
@@ -711,8 +685,6 @@ define amdgpu_kernel void @fabs_fpext_f16_to_f32(
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e64 v0, |v0|
 ; GFX11-FAKE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-FAKE16-NEXT:    s_nop 0
-; GFX11-FAKE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
@@ -795,8 +767,6 @@ define amdgpu_kernel void @fneg_fabs_fpext_f16_to_f32(
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e64 v0, -|v0.l|
 ; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-TRUE16-NEXT:    s_nop 0
-; GFX11-TRUE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: fneg_fabs_fpext_f16_to_f32:
@@ -815,8 +785,6 @@ define amdgpu_kernel void @fneg_fabs_fpext_f16_to_f32(
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e64 v0, -|v0|
 ; GFX11-FAKE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-FAKE16-NEXT:    s_nop 0
-; GFX11-FAKE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
@@ -920,8 +888,6 @@ define amdgpu_kernel void @fneg_multi_use_fpext_f16_to_f32(
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-TRUE16-NEXT:    buffer_store_b16 v1, off, s[4:7], 0 dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    s_nop 0
-; GFX11-TRUE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: fneg_multi_use_fpext_f16_to_f32:
@@ -944,8 +910,6 @@ define amdgpu_kernel void @fneg_multi_use_fpext_f16_to_f32(
 ; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-FAKE16-NEXT:    buffer_store_b16 v0, off, s[4:7], 0 dlc
 ; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT:    s_nop 0
-; GFX11-FAKE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
@@ -1049,8 +1013,6 @@ define amdgpu_kernel void @fneg_multi_foldable_use_fpext_f16_to_f32(
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[4:7], 0 dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    s_nop 0
-; GFX11-TRUE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: fneg_multi_foldable_use_fpext_f16_to_f32:
@@ -1073,8 +1035,6 @@ define amdgpu_kernel void @fneg_multi_foldable_use_fpext_f16_to_f32(
 ; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-FAKE16-NEXT:    buffer_store_b16 v0, off, s[4:7], 0 dlc
 ; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT:    s_nop 0
-; GFX11-FAKE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
@@ -1177,8 +1137,6 @@ define amdgpu_kernel void @fabs_multi_use_fpext_f16_to_f32(
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-TRUE16-NEXT:    buffer_store_b16 v1, off, s[4:7], 0 dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    s_nop 0
-; GFX11-TRUE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: fabs_multi_use_fpext_f16_to_f32:
@@ -1201,8 +1159,6 @@ define amdgpu_kernel void @fabs_multi_use_fpext_f16_to_f32(
 ; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-FAKE16-NEXT:    buffer_store_b16 v0, off, s[4:7], 0 dlc
 ; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT:    s_nop 0
-; GFX11-FAKE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
@@ -1306,8 +1262,6 @@ define amdgpu_kernel void @fabs_multi_foldable_use_fpext_f16_to_f32(
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[4:7], 0 dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    s_nop 0
-; GFX11-TRUE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: fabs_multi_foldable_use_fpext_f16_to_f32:
@@ -1330,8 +1284,6 @@ define amdgpu_kernel void @fabs_multi_foldable_use_fpext_f16_to_f32(
 ; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-FAKE16-NEXT:    buffer_store_b16 v0, off, s[4:7], 0 dlc
 ; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT:    s_nop 0
-; GFX11-FAKE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
@@ -1434,8 +1386,6 @@ define amdgpu_kernel void @fabs_fneg_multi_use_fpext_f16_to_f32(
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-TRUE16-NEXT:    buffer_store_b16 v1, off, s[4:7], 0 dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    s_nop 0
-; GFX11-TRUE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: fabs_fneg_multi_use_fpext_f16_to_f32:
@@ -1458,8 +1408,6 @@ define amdgpu_kernel void @fabs_fneg_multi_use_fpext_f16_to_f32(
 ; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-FAKE16-NEXT:    buffer_store_b16 v0, off, s[4:7], 0 dlc
 ; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT:    s_nop 0
-; GFX11-FAKE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
@@ -1564,8 +1512,6 @@ define amdgpu_kernel void @fabs_fneg_multi_foldable_use_fpext_f16_to_f32(
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[4:7], 0 dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    s_nop 0
-; GFX11-TRUE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: fabs_fneg_multi_foldable_use_fpext_f16_to_f32:
@@ -1588,8 +1534,6 @@ define amdgpu_kernel void @fabs_fneg_multi_foldable_use_fpext_f16_to_f32(
 ; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-FAKE16-NEXT:    buffer_store_b16 v0, off, s[4:7], 0 dlc
 ; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT:    s_nop 0
-; GFX11-FAKE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
diff --git a/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll b/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll
index 327f2653c47462..9e92a89501cf6b 100644
--- a/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll
@@ -59,8 +59,6 @@ define amdgpu_kernel void @fptosi_f16_to_i16(
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_cvt_i16_f16_e32 v0.l, v0.l
 ; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-TRUE16-NEXT:    s_nop 0
-; GFX11-TRUE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: fptosi_f16_to_i16:
@@ -79,8 +77,6 @@ define amdgpu_kernel void @fptosi_f16_to_i16(
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FAKE16-NEXT:    v_cvt_i16_f16_e32 v0, v0
 ; GFX11-FAKE16-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-FAKE16-NEXT:    s_nop 0
-; GFX11-FAKE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
@@ -148,8 +144,6 @@ define amdgpu_kernel void @fptosi_f16_to_i32(
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_cvt_i32_f32_e32 v0, v0
 ; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-TRUE16-NEXT:    s_nop 0
-; GFX11-TRUE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: fptosi_f16_to_i32:
@@ -170,8 +164,6 @@ define amdgpu_kernel void @fptosi_f16_to_i32(
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_cvt_i32_f32_e32 v0, v0
 ; GFX11-FAKE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-FAKE16-NEXT:    s_nop 0
-; GFX11-FAKE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
@@ -245,8 +237,6 @@ define amdgpu_kernel void @fptosi_f16_to_i64(
 ; GFX11-TRUE16-NEXT:    v_cvt_i32_f32_e32 v0, v0
 ; GFX11-TRUE16-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GFX11-TRUE16-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
-; GFX11-TRUE16-NEXT:    s_nop 0
-; GFX11-TRUE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: fptosi_f16_to_i64:
@@ -268,8 +258,6 @@ define amdgpu_kernel void @fptosi_f16_to_i64(
 ; GFX11-FAKE16-NEXT:    v_cvt_i32_f32_e32 v0, v0
 ; GFX11-FAKE16-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GFX11-FAKE16-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
-; GFX11-FAKE16-NEXT:    s_nop 0
-; GFX11-FAKE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
@@ -351,8 +339,6 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i16(
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
 ; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-TRUE16-NEXT:    s_nop 0
-; GFX11-TRUE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: fptosi_v2f16_to_v2i16:
@@ -377,8 +363,6 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i16(
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX11-FAKE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-FAKE16-NEXT:    s_nop 0
-; GFX11-FAKE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
@@ -455,8 +439,6 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i32(
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_cvt_i32_f32_e32 v1, v1
 ; GFX11-TRUE16-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
-; GFX11-TRUE16-NEXT:    s_nop 0
-; GFX11-TRUE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: fptosi_v2f16_to_v2i32:
@@ -481,8 +463,6 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i32(
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_cvt_i32_f32_e32 v1, v1
 ; GFX11-FAKE16-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
-; GFX11-FAKE16-NEXT:    s_nop 0
-; GFX11-FAKE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
@@ -569,8 +549,6 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i64(
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
 ; GFX11-TRUE16-NEXT:    buffer_store_b128 v[0:3], off, s[4:7], 0
-; GFX11-TRUE16-NEXT:    s_nop 0
-; GFX11-TRUE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: fptosi_v2f16_to_v2i64:
@@ -598,8 +576,6 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i64(
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
 ; GFX11-FAKE16-NEXT:    buffer_store_b128 v[0:3], off, s[4:7], 0
-; GFX11-FAKE16-NEXT:    s_nop 0
-; GFX11-FAKE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
@@ -649,8 +625,6 @@ define amdgpu_kernel void @fptosi_f16_to_i1(ptr addrspace(1) %out, half %in) {
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_f16_e32 vcc_lo, -1.0, v0
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX11-TRUE16-NEXT:    buffer_store_b8 v0, off, s[0:3], 0
-; GFX11-TRUE16-NEXT:    s_nop 0
-; GFX11-TRUE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: fptosi_f16_to_i1:
@@ -665,8 +639,6 @@ define amdgpu_kernel void @fptosi_f16_to_i1(ptr addrspace(1) %out, half %in) {
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s2
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s2, -1
 ; GFX11-FAKE16-NEXT:    buffer_store_b8 v0, off, s[0:3], 0
-; GFX11-FAKE16-NEXT:    s_nop 0
-; GFX11-FAKE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FAKE16-NEXT:    s_endpgm
 entry:
   %conv = fptosi half %in to i1
diff --git a/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll b/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll
index ba540f4948b50a..804208998f9e8f 100644
--- a/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll
@@ -59,8 +59,6 @@ define amdgpu_kernel void @fptoui_f16_to_i16(
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_cvt_u16_f16_e32 v0.l, v0.l
 ; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-TRUE16-NEXT:    s_nop 0
-; GFX11-TRUE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: fptoui_f16_to_i16:
@@ -79,8 +77,6 @@ define amdgpu_kernel void @fptoui_f16_to_i16(
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FAKE16-NEXT:    v_cvt_u16_f16_e32 v0, v0
 ; GFX11-FAKE16-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-FAKE16-NEXT:    s_nop 0
-; GFX11-FAKE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
@@ -148,8 +144,6 @@ define amdgpu_kernel void @fptoui_f16_to_i32(
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-TRUE16-NEXT:    s_nop 0
-; GFX11-TRUE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: fptoui_f16_to_i32:
@@ -170,8 +164,6 @@ define amdgpu_kernel void @fptoui_f16_to_i32(
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX11-FAKE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-FAKE16-NEXT:    s_nop 0
-; GFX11-FAKE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
@@ -245,8 +237,6 @@ define amdgpu_kernel void @fptoui_f16_to_i64(
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX11-TRUE16-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
-; GFX11-TRUE16-NEXT:    s_nop 0
-; GFX11-TRUE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: fptoui_f16_to_i64:
@@ -268,8 +258,6 @@ define amdgpu_kernel void @fptoui_f16_to_i64(
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX11-FAKE16-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
-; GFX11-FAKE16-NEXT:    s_nop 0
-; GFX11-FAKE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
@@ -350,8 +338,6 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i16(
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
 ; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-TRUE16-NEXT:    s_nop 0
-; GFX11-TRUE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: fptoui_v2f16_to_v2i16:
@@ -376,8 +362,6 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i16(
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX11-FAKE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-FAKE16-NEXT:    s_nop 0
-; GFX11-FAKE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
@@ -454,8 +438,6 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i32(
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX11-TRUE16-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
-; GFX11-TRUE16-NEXT:    s_nop 0
-; GFX11-TRUE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: fptoui_v2f16_to_v2i32:
@@ -480,8 +462,6 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i32(
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX11-FAKE16-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
-; GFX11-FAKE16-NEXT:    s_nop 0
-; GFX11-FAKE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
@@ -567,8 +547,6 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i64(
 ; GFX11-TRUE16-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX11-TRUE16-NEXT:    buffer_store_b128 v[0:3], off, s[4:7], 0
-; GFX11-TRUE16-NEXT:    s_nop 0
-; GFX11-TRUE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: fptoui_v2f16_to_v2i64:
@@ -595,8 +573,6 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i64(
 ; GFX11-FAKE16-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX11-FAKE16-NEXT:    buffer_store_b128 v[0:3], off, s[4:7], 0
-; GFX11-FAKE16-NEXT:    s_nop 0
-; GFX11-FAKE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
@@ -647,8 +623,6 @@ define amdgpu_kernel void @fptoui_f16_to_i1(ptr addrspace(1) %out, half %in) {
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_f16_e32 vcc_lo, 1.0, v0
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX11-TRUE16-NEXT:    buffer_store_b8 v0, off, s[0:3], 0
-; GFX11-TRUE16-NEXT:    s_nop 0
-; GFX11-TRUE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: fptoui_f16_to_i1:
@@ -663,8 +637,6 @@ define amdgpu_kernel void @fptoui_f16_to_i1(ptr addrspace(1) %out, half %in) {
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s2
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s2, -1
 ; GFX11-FAKE16-NEXT:    buffer_store_b8 v0, off, s[0:3], 0
-; GFX11-FAKE16-NEXT:    s_nop 0
-; GFX11-FAKE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FAKE16-NEXT:    s_endpgm
 entry:
   %conv = fptoui half %in to i1
diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
index 0817ac1b3cd677..0ea412a6b6f131 100644
--- a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
@@ -115,8 +115,6 @@ define amdgpu_kernel void @fptrunc_f32_to_f16(
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX11-SDAG-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: fptrunc_f32_to_f16:
@@ -129,8 +127,6 @@ define amdgpu_kernel void @fptrunc_f32_to_f16(
 ; GFX11-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, s2
 ; GFX11-GISEL-NEXT:    s_mov_b32 s2, -1
 ; GFX11-GISEL-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
@@ -256,8 +252,6 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX11-SDAG-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: fptrunc_f64_to_f16:
@@ -272,8 +266,6 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX11-GISEL-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
@@ -409,8 +401,6 @@ define amdgpu_kernel void @fptrunc_v2f32_to_v2f16(
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX11-SDAG-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: fptrunc_v2f32_to_v2f16:
@@ -426,8 +416,6 @@ define amdgpu_kernel void @fptrunc_v2f32_to_v2f16(
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX11-GISEL-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
@@ -578,8 +566,6 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
 ; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX11-SDAG-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX11-SDAG-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: fptrunc_v2f64_to_v2f16:
@@ -598,8 +584,6 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX11-GISEL-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
@@ -717,8 +701,6 @@ define amdgpu_kernel void @fneg_fptrunc_f32_to_f16(
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-SDAG-NEXT:    v_cvt_f16_f32_e64 v0, -v0
 ; GFX11-SDAG-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: fneg_fptrunc_f32_to_f16:
@@ -731,8 +713,6 @@ define amdgpu_kernel void @fneg_fptrunc_f32_to_f16(
 ; GFX11-GISEL-NEXT:    v_cvt_f16_f32_e64 v0, -s2
 ; GFX11-GISEL-NEXT:    s_mov_b32 s2, -1
 ; GFX11-GISEL-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
@@ -851,8 +831,6 @@ define amdgpu_kernel void @fabs_fptrunc_f32_to_f16(
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-SDAG-NEXT:    v_cvt_f16_f32_e64 v0, |v0|
 ; GFX11-SDAG-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: fabs_fptrunc_f32_to_f16:
@@ -865,8 +843,6 @@ define amdgpu_kernel void @fabs_fptrunc_f32_to_f16(
 ; GFX11-GISEL-NEXT:    v_cvt_f16_f32_e64 v0, |s2|
 ; GFX11-GISEL-NEXT:    s_mov_b32 s2, -1
 ; GFX11-GISEL-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
@@ -985,8 +961,6 @@ define amdgpu_kernel void @fneg_fabs_fptrunc_f32_to_f16(
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-SDAG-NEXT:    v_cvt_f16_f32_e64 v0, -|v0|
 ; GFX11-SDAG-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: fneg_fabs_fptrunc_f32_to_f16:
@@ -999,8 +973,6 @@ define amdgpu_kernel void @fneg_fabs_fptrunc_f32_to_f16(
 ; GFX11-GISEL-NEXT:    v_cvt_f16_f32_e64 v0, -|s2|
 ; GFX11-GISEL-NEXT:    s_mov_b32 s2, -1
 ; GFX11-GISEL-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) #0 {
@@ -1122,8 +1094,6 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_zext_i32(
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX11-SDAG-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: fptrunc_f32_to_f16_zext_i32:
@@ -1138,8 +1108,6 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_zext_i32(
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX11-GISEL-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) #0 {
@@ -1261,8 +1229,6 @@ define amdgpu_kernel void @fptrunc_fabs_f32_to_f16_zext_i32(
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX11-SDAG-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: fptrunc_fabs_f32_to_f16_zext_i32:
@@ -1277,8 +1243,6 @@ define amdgpu_kernel void @fptrunc_fabs_f32_to_f16_zext_i32(
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX11-GISEL-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) #0 {
@@ -1407,8 +1371,6 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_sext_i32(
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SDAG-NEXT:    v_bfe_i32 v0, v0, 0, 16
 ; GFX11-SDAG-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: fptrunc_f32_to_f16_sext_i32:
@@ -1423,8 +1385,6 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_sext_i32(
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_bfe_i32 v0, v0, 0, 16
 ; GFX11-GISEL-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.ll
index 1ba5e8f916cbaa..7a18e2ef7b4a84 100644
--- a/llvm/test/CodeGen/AMDGPU/fptrunc.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptrunc.ll
@@ -76,8 +76,6 @@ define amdgpu_kernel void @fptrunc_f64_to_f32(ptr addrspace(1) %out, double %in)
 ; GFX11-SDAG-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-SDAG-NEXT:    s_mov_b32 s2, -1
 ; GFX11-SDAG-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: fptrunc_f64_to_f32:
@@ -88,8 +86,6 @@ define amdgpu_kernel void @fptrunc_f64_to_f32(ptr addrspace(1) %out, double %in)
 ; GFX11-GISEL-NEXT:    s_mov_b32 s2, -1
 ; GFX11-GISEL-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-GISEL-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
   %result = fptrunc double %in to float
   store float %result, ptr addrspace(1) %out
@@ -483,8 +479,6 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
 ; GFX11-SAFE-SDAG-NEXT:    v_or_b32_e32 v0, s2, v0
 ; GFX11-SAFE-SDAG-NEXT:    s_mov_b32 s2, -1
 ; GFX11-SAFE-SDAG-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
-; GFX11-SAFE-SDAG-NEXT:    s_nop 0
-; GFX11-SAFE-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SAFE-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-SAFE-GISEL-LABEL: fptrunc_f64_to_f16:
@@ -542,8 +536,6 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
 ; GFX11-SAFE-GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-SAFE-GISEL-NEXT:    s_mov_b32 s2, -1
 ; GFX11-SAFE-GISEL-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
-; GFX11-SAFE-GISEL-NEXT:    s_nop 0
-; GFX11-SAFE-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SAFE-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-UNSAFE-SDAG-LABEL: fptrunc_f64_to_f16:
@@ -556,8 +548,6 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
 ; GFX11-UNSAFE-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-UNSAFE-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX11-UNSAFE-SDAG-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
-; GFX11-UNSAFE-SDAG-NEXT:    s_nop 0
-; GFX11-UNSAFE-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-UNSAFE-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-UNSAFE-GISEL-LABEL: fptrunc_f64_to_f16:
@@ -570,8 +560,6 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
 ; GFX11-UNSAFE-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-UNSAFE-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX11-UNSAFE-GISEL-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
-; GFX11-UNSAFE-GISEL-NEXT:    s_nop 0
-; GFX11-UNSAFE-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-UNSAFE-GISEL-NEXT:    s_endpgm
   %result = fptrunc double %in to half
   %result_i16 = bitcast half %result to i16
@@ -653,8 +641,6 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f32(ptr addrspace(1) %out, <2 x do
 ; GFX11-SDAG-NEXT:    v_cvt_f32_f64_e32 v1, s[6:7]
 ; GFX11-SDAG-NEXT:    v_cvt_f32_f64_e32 v0, s[4:5]
 ; GFX11-SDAG-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: fptrunc_v2f64_to_v2f32:
@@ -668,8 +654,6 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f32(ptr addrspace(1) %out, <2 x do
 ; GFX11-GISEL-NEXT:    v_cvt_f32_f64_e32 v0, s[4:5]
 ; GFX11-GISEL-NEXT:    v_cvt_f32_f64_e32 v1, s[6:7]
 ; GFX11-GISEL-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
   %result = fptrunc <2 x double> %in to <2 x float>
   store <2 x float> %result, ptr addrspace(1) %out
@@ -763,8 +747,6 @@ define amdgpu_kernel void @fptrunc_v3f64_to_v3f32(ptr addrspace(1) %out, <3 x do
 ; GFX11-SDAG-NEXT:    s_mov_b32 s2, -1
 ; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    buffer_store_b96 v[0:2], off, s[0:3], 0
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: fptrunc_v3f64_to_v3f32:
@@ -779,8 +761,6 @@ define amdgpu_kernel void @fptrunc_v3f64_to_v3f32(ptr addrspace(1) %out, <3 x do
 ; GFX11-GISEL-NEXT:    v_cvt_f32_f64_e32 v1, s[6:7]
 ; GFX11-GISEL-NEXT:    v_cvt_f32_f64_e32 v2, s[8:9]
 ; GFX11-GISEL-NEXT:    buffer_store_b96 v[0:2], off, s[0:3], 0
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
   %result = fptrunc <3 x double> %in to <3 x float>
   store <3 x float> %result, ptr addrspace(1) %out
@@ -873,8 +853,6 @@ define amdgpu_kernel void @fptrunc_v4f64_to_v4f32(ptr addrspace(1) %out, <4 x do
 ; GFX11-SDAG-NEXT:    v_cvt_f32_f64_e32 v1, s[6:7]
 ; GFX11-SDAG-NEXT:    v_cvt_f32_f64_e32 v0, s[4:5]
 ; GFX11-SDAG-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: fptrunc_v4f64_to_v4f32:
@@ -890,8 +868,6 @@ define amdgpu_kernel void @fptrunc_v4f64_to_v4f32(ptr addrspace(1) %out, <4 x do
 ; GFX11-GISEL-NEXT:    v_cvt_f32_f64_e32 v2, s[8:9]
 ; GFX11-GISEL-NEXT:    v_cvt_f32_f64_e32 v3, s[10:11]
 ; GFX11-GISEL-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
   %result = fptrunc <4 x double> %in to <4 x float>
   store <4 x float> %result, ptr addrspace(1) %out
@@ -1015,8 +991,6 @@ define amdgpu_kernel void @fptrunc_v8f64_to_v8f32(ptr addrspace(1) %out, <8 x do
 ; GFX11-SDAG-NEXT:    s_clause 0x1
 ; GFX11-SDAG-NEXT:    buffer_store_b128 v[4:7], off, s[0:3], 0 offset:16
 ; GFX11-SDAG-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: fptrunc_v8f64_to_v8f32:
@@ -1038,8 +1012,6 @@ define amdgpu_kernel void @fptrunc_v8f64_to_v8f32(ptr addrspace(1) %out, <8 x do
 ; GFX11-GISEL-NEXT:    s_clause 0x1
 ; GFX11-GISEL-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
 ; GFX11-GISEL-NEXT:    buffer_store_b128 v[4:7], off, s[0:3], 0 offset:16
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
   %result = fptrunc <8 x double> %in to <8 x float>
   store <8 x float> %result, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll
index b3432c457d9a45..a92015269f8ce9 100644
--- a/llvm/test/CodeGen/AMDGPU/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/frem.ll
@@ -214,8 +214,6 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_fma_f16 v1, -v3, v2, v1
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX1150-LABEL: frem_f16:
@@ -253,8 +251,6 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1150-NEXT:    v_fmac_f16_e32 v1, v3, v2
 ; GFX1150-NEXT:    global_store_b16 v0, v1, s[4:5]
-; GFX1150-NEXT:    s_nop 0
-; GFX1150-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1150-NEXT:    s_endpgm
                       ptr addrspace(1) %in2) #0 {
    %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4
@@ -399,8 +395,6 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX11-NEXT:    v_trunc_f16_e32 v3, v3
 ; GFX11-NEXT:    v_fma_f16 v1, -v3, v2, v1
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX1150-LABEL: fast_frem_f16:
@@ -422,8 +416,6 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX1150-NEXT:    v_xor_b32_e32 v3, 0x8000, v3
 ; GFX1150-NEXT:    v_fmac_f16_e32 v1, v3, v2
 ; GFX1150-NEXT:    global_store_b16 v0, v1, s[4:5]
-; GFX1150-NEXT:    s_nop 0
-; GFX1150-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1150-NEXT:    s_endpgm
                       ptr addrspace(1) %in2) #0 {
    %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4
@@ -568,8 +560,6 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(
 ; GFX11-NEXT:    v_trunc_f16_e32 v3, v3
 ; GFX11-NEXT:    v_fma_f16 v1, -v3, v2, v1
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX1150-LABEL: unsafe_frem_f16:
@@ -591,8 +581,6 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1150-NEXT:    v_xor_b32_e32 v3, 0x8000, v3
 ; GFX1150-NEXT:    v_fmac_f16_e32 v1, v3, v2
 ; GFX1150-NEXT:    global_store_b16 v0, v1, s[4:5]
-; GFX1150-NEXT:    s_nop 0
-; GFX1150-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1150-NEXT:    s_endpgm
                              ptr addrspace(1) %in2) #1 {
    %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4
@@ -799,8 +787,6 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; GFX11-NEXT:    v_trunc_f32_e32 v3, v3
 ; GFX11-NEXT:    v_fma_f32 v1, -v3, v2, v1
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX1150-LABEL: frem_f32:
@@ -837,8 +823,6 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; GFX1150-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
 ; GFX1150-NEXT:    v_fmac_f32_e32 v1, v3, v2
 ; GFX1150-NEXT:    global_store_b32 v0, v1, s[4:5]
-; GFX1150-NEXT:    s_nop 0
-; GFX1150-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1150-NEXT:    s_endpgm
                       ptr addrspace(1) %in2) #0 {
    %gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4
@@ -975,8 +959,6 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX11-NEXT:    v_trunc_f32_e32 v3, v3
 ; GFX11-NEXT:    v_fma_f32 v1, -v3, v2, v1
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX1150-LABEL: fast_frem_f32:
@@ -998,8 +980,6 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX1150-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
 ; GFX1150-NEXT:    v_fmac_f32_e32 v1, v3, v2
 ; GFX1150-NEXT:    global_store_b32 v0, v1, s[4:5]
-; GFX1150-NEXT:    s_nop 0
-; GFX1150-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1150-NEXT:    s_endpgm
                       ptr addrspace(1) %in2) #0 {
    %gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4
@@ -1136,8 +1116,6 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace(
 ; GFX11-NEXT:    v_trunc_f32_e32 v3, v3
 ; GFX11-NEXT:    v_fma_f32 v1, -v3, v2, v1
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX1150-LABEL: unsafe_frem_f32:
@@ -1159,8 +1137,6 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1150-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
 ; GFX1150-NEXT:    v_fmac_f32_e32 v1, v3, v2
 ; GFX1150-NEXT:    global_store_b32 v0, v1, s[4:5]
-; GFX1150-NEXT:    s_nop 0
-; GFX1150-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1150-NEXT:    s_endpgm
                              ptr addrspace(1) %in2) #1 {
    %gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4
@@ -1380,8 +1356,6 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; GFX11-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
 ; GFX11-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
 ; GFX11-NEXT:    global_store_b64 v12, v[0:1], s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX1150-LABEL: frem_f64:
@@ -1415,8 +1389,6 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1150-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
 ; GFX1150-NEXT:    global_store_b64 v12, v[0:1], s[4:5]
-; GFX1150-NEXT:    s_nop 0
-; GFX1150-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1150-NEXT:    s_endpgm
                       ptr addrspace(1) %in2) #0 {
    %r0 = load double, ptr addrspace(1) %in1, align 8
@@ -1609,8 +1581,6 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX11-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
 ; GFX11-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
 ; GFX11-NEXT:    global_store_b64 v10, v[0:1], s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX1150-LABEL: fast_frem_f64:
@@ -1640,8 +1610,6 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1150-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
 ; GFX1150-NEXT:    global_store_b64 v10, v[0:1], s[4:5]
-; GFX1150-NEXT:    s_nop 0
-; GFX1150-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1150-NEXT:    s_endpgm
                       ptr addrspace(1) %in2) #0 {
    %r0 = load double, ptr addrspace(1) %in1, align 8
@@ -1834,8 +1802,6 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(
 ; GFX11-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
 ; GFX11-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
 ; GFX11-NEXT:    global_store_b64 v10, v[0:1], s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX1150-LABEL: unsafe_frem_f64:
@@ -1865,8 +1831,6 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1150-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
 ; GFX1150-NEXT:    global_store_b64 v10, v[0:1], s[4:5]
-; GFX1150-NEXT:    s_nop 0
-; GFX1150-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1150-NEXT:    s_endpgm
                              ptr addrspace(1) %in2) #1 {
    %r0 = load double, ptr addrspace(1) %in1, align 8
@@ -2203,8 +2167,6 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX11-NEXT:    v_fma_f16 v1, -v1, v6, v4
 ; GFX11-NEXT:    v_pack_b32_f16 v1, v3, v1
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX1150-LABEL: frem_v2f16:
@@ -2268,8 +2230,6 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1150-NEXT:    v_pack_b32_f16 v1, v1, v3
 ; GFX1150-NEXT:    global_store_b32 v0, v1, s[4:5]
-; GFX1150-NEXT:    s_nop 0
-; GFX1150-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1150-NEXT:    s_endpgm
                         ptr addrspace(1) %in2) #0 {
    %gep2 = getelementptr <2 x half>, ptr addrspace(1) %in2, i32 4
@@ -2826,8 +2786,6 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v3, v0
 ; GFX11-NEXT:    global_store_b64 v4, v[0:1], s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX1150-LABEL: frem_v4f16:
@@ -2938,8 +2896,6 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1150-NEXT:    v_fmac_f16_e32 v1, v5, v3
 ; GFX1150-NEXT:    v_pack_b32_f16 v1, v1, v2
 ; GFX1150-NEXT:    global_store_b64 v4, v[0:1], s[4:5]
-; GFX1150-NEXT:    s_nop 0
-; GFX1150-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1150-NEXT:    s_endpgm
                         ptr addrspace(1) %in2) #0 {
    %gep2 = getelementptr <4 x half>, ptr addrspace(1) %in2, i32 4
@@ -3242,8 +3198,6 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX11-NEXT:    v_trunc_f32_e32 v3, v3
 ; GFX11-NEXT:    v_fma_f32 v0, -v3, v2, v0
 ; GFX11-NEXT:    global_store_b64 v4, v[0:1], s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX1150-LABEL: frem_v2f32:
@@ -3302,8 +3256,6 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1150-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
 ; GFX1150-NEXT:    v_fmac_f32_e32 v0, v3, v2
 ; GFX1150-NEXT:    global_store_b64 v4, v[0:1], s[4:5]
-; GFX1150-NEXT:    s_nop 0
-; GFX1150-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1150-NEXT:    s_endpgm
                         ptr addrspace(1) %in2) #0 {
    %gep2 = getelementptr <2 x float>, ptr addrspace(1) %in2, i32 4
@@ -3798,8 +3750,6 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX11-NEXT:    v_trunc_f32_e32 v5, v5
 ; GFX11-NEXT:    v_fma_f32 v0, -v5, v4, v0
 ; GFX11-NEXT:    global_store_b128 v8, v[0:3], s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX1150-LABEL: frem_v4f32:
@@ -3902,8 +3852,6 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1150-NEXT:    v_xor_b32_e32 v5, 0x80000000, v5
 ; GFX1150-NEXT:    v_fmac_f32_e32 v0, v5, v4
 ; GFX1150-NEXT:    global_store_b128 v8, v[0:3], s[4:5]
-; GFX1150-NEXT:    s_nop 0
-; GFX1150-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1150-NEXT:    s_endpgm
                         ptr addrspace(1) %in2) #0 {
    %gep2 = getelementptr <4 x float>, ptr addrspace(1) %in2, i32 4
@@ -4233,8 +4181,6 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX11-NEXT:    v_trunc_f64_e32 v[6:7], v[6:7]
 ; GFX11-NEXT:    v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
 ; GFX11-NEXT:    global_store_b128 v16, v[0:3], s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX1150-LABEL: frem_v2f64:
@@ -4286,8 +4232,6 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1150-NEXT:    v_trunc_f64_e32 v[6:7], v[6:7]
 ; GFX1150-NEXT:    v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
 ; GFX1150-NEXT:    global_store_b128 v16, v[0:3], s[4:5]
-; GFX1150-NEXT:    s_nop 0
-; GFX1150-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1150-NEXT:    s_endpgm
                         ptr addrspace(1) %in2) #0 {
    %gep2 = getelementptr <2 x double>, ptr addrspace(1) %in2, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/fshl.ll b/llvm/test/CodeGen/AMDGPU/fshl.ll
index 4f230140f7ba4d..8ad20d3da9a96c 100644
--- a/llvm/test/CodeGen/AMDGPU/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/fshl.ll
@@ -99,8 +99,6 @@ define amdgpu_kernel void @fshl_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_alignbit_b32 v0, s2, v0, s3
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %0 = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %z)
@@ -171,8 +169,6 @@ define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) {
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_alignbit_b32 v1, s2, s3, 25
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %0 = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 7)
@@ -303,8 +299,6 @@ define amdgpu_kernel void @fshl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
 ; GFX11-NEXT:    v_alignbit_b32 v1, s5, v0, s1
 ; GFX11-NEXT:    v_alignbit_b32 v0, s4, v3, s0
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %0 = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %z)
@@ -390,8 +384,6 @@ define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2
 ; GFX11-NEXT:    v_alignbit_b32 v1, s5, s7, 23
 ; GFX11-NEXT:    v_alignbit_b32 v0, s4, s6, 25
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %0 = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> <i32 7, i32 9>)
@@ -582,8 +574,6 @@ define amdgpu_kernel void @fshl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
 ; GFX11-NEXT:    v_alignbit_b32 v1, s5, v5, s9
 ; GFX11-NEXT:    v_alignbit_b32 v0, s4, v6, s8
 ; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %0 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z)
@@ -689,8 +679,6 @@ define amdgpu_kernel void @fshl_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4
 ; GFX11-NEXT:    v_alignbit_b32 v1, s5, s9, 25
 ; GFX11-NEXT:    v_alignbit_b32 v0, s4, s8, 31
 ; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %0 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 7, i32 9, i32 33>)
@@ -782,8 +770,6 @@ define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) {
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %shl = shl i32 %a, 7
   %xor = xor i32 %shl, %b
diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll
index 31f574d44ab8c2..551af1aa5cf7ae 100644
--- a/llvm/test/CodeGen/AMDGPU/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/fshr.ll
@@ -91,8 +91,6 @@ define amdgpu_kernel void @fshr_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_alignbit_b32 v0, s4, s5, v0
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %0 = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %z)
@@ -163,8 +161,6 @@ define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) {
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_alignbit_b32 v1, s2, s3, 7
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %0 = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 7)
@@ -265,8 +261,6 @@ define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
 ; GFX11-NEXT:    v_alignbit_b32 v1, s5, s7, v0
 ; GFX11-NEXT:    v_alignbit_b32 v0, s4, s6, v2
 ; GFX11-NEXT:    global_store_b64 v3, v[0:1], s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %z)
@@ -352,8 +346,6 @@ define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2
 ; GFX11-NEXT:    v_alignbit_b32 v1, s5, s7, 9
 ; GFX11-NEXT:    v_alignbit_b32 v0, s4, s6, 7
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> <i32 7, i32 9>)
@@ -484,8 +476,6 @@ define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
 ; GFX11-NEXT:    v_alignbit_b32 v1, s5, s9, v4
 ; GFX11-NEXT:    v_alignbit_b32 v0, s4, s8, v5
 ; GFX11-NEXT:    global_store_b128 v6, v[0:3], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %0 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z)
@@ -589,8 +579,6 @@ define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4
 ; GFX11-NEXT:    v_alignbit_b32 v1, s5, s9, 7
 ; GFX11-NEXT:    v_alignbit_b32 v0, s4, s8, 1
 ; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %0 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 7, i32 9, i32 33>)
diff --git a/llvm/test/CodeGen/AMDGPU/fsub.f16.ll b/llvm/test/CodeGen/AMDGPU/fsub.f16.ll
index 2a79793443fb28..5495f0a8b0b714 100644
--- a/llvm/test/CodeGen/AMDGPU/fsub.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fsub.f16.ll
@@ -76,8 +76,6 @@ define amdgpu_kernel void @fsub_f16(
 ; GFX11-NEXT:    s_mov_b32 s9, s5
 ; GFX11-NEXT:    v_sub_f16_e32 v0, v0, v1
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
@@ -163,8 +161,6 @@ define amdgpu_kernel void @fsub_f16_imm_a(
 ; GFX11-NEXT:    s_mov_b32 s5, s1
 ; GFX11-NEXT:    v_sub_f16_e32 v0, 1.0, v0
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %b) {
@@ -248,8 +244,6 @@ define amdgpu_kernel void @fsub_f16_imm_b(
 ; GFX11-NEXT:    s_mov_b32 s5, s1
 ; GFX11-NEXT:    v_add_f16_e32 v0, -2.0, v0
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
@@ -362,8 +356,6 @@ define amdgpu_kernel void @fsub_v2f16(
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_add_f16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
@@ -459,8 +451,6 @@ define amdgpu_kernel void @fsub_v2f16_imm_a(
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_add_f16 v0, 0x40003c00, v0 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %b) {
@@ -554,8 +544,6 @@ define amdgpu_kernel void @fsub_v2f16_imm_b(
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_add_f16 v0, 0xbc00c000, v0
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
diff --git a/llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll b/llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll
index 08da89ec0fb229..020c9dc130bb2a 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll
@@ -9,8 +9,6 @@ define amdgpu_ps void @test_s_load_i8(ptr addrspace(4) inreg %in, ptr addrspace(
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GCN-NEXT:    s_endpgm
   %ld = load i8, ptr addrspace(4) %in
   %sext = sext i8 %ld to i32
@@ -29,8 +27,6 @@ define amdgpu_ps void @test_s_load_i8_imm(ptr addrspace(4) inreg %in, ptr addrsp
 ; DAG-NEXT:    s_wait_kmcnt 0x0
 ; DAG-NEXT:    v_mov_b32_e32 v2, s0
 ; DAG-NEXT:    global_store_b32 v[0:1], v2, off
-; DAG-NEXT:    s_nop 0
-; DAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; DAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_s_load_i8_imm:
@@ -41,8 +37,6 @@ define amdgpu_ps void @test_s_load_i8_imm(ptr addrspace(4) inreg %in, ptr addrsp
 ; GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GISEL-NEXT:    v_mov_b32_e32 v2, s0
 ; GISEL-NEXT:    global_store_b32 v[0:1], v2, off
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr addrspace(4) %in, i64 -100
   %ld = load i8, ptr addrspace(4) %gep
@@ -58,8 +52,6 @@ define amdgpu_ps void @test_s_load_i8_sgpr(ptr addrspace(4) inreg %in, i32 inreg
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GCN-NEXT:    s_endpgm
   %zext = zext i32 %offset to i64
   %gep = getelementptr i8, ptr addrspace(4) %in, i64 %zext
@@ -76,8 +68,6 @@ define amdgpu_ps void @test_s_load_i8_sgpr_imm(ptr addrspace(4) inreg %in, i32 i
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GCN-NEXT:    s_endpgm
   %gep1 = getelementptr i8, ptr addrspace(4) %in, i64 16
   %zext = zext i32 %offset to i64
@@ -94,8 +84,6 @@ define amdgpu_ps void @test_s_load_i8_divergent(ptr addrspace(4) inreg %in, i32
 ; GCN-NEXT:    global_load_i8 v0, v0, s[0:1] offset:16
 ; GCN-NEXT:    s_wait_loadcnt 0x0
 ; GCN-NEXT:    global_store_b32 v[1:2], v0, off
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GCN-NEXT:    s_endpgm
   %gep1 = getelementptr i8, ptr addrspace(4) %in, i64 16
   %zext = zext i32 %offset to i64
@@ -113,8 +101,6 @@ define amdgpu_ps void @test_s_load_u8(ptr addrspace(4) inreg %in, ptr addrspace(
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GCN-NEXT:    s_endpgm
   %ld = load i8, ptr addrspace(4) %in
   %zext = zext i8 %ld to i32
@@ -129,8 +115,6 @@ define amdgpu_ps void @test_s_load_u8_imm(ptr addrspace(4) inreg %in, ptr addrsp
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GCN-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr addrspace(4) %in, i64 255
   %ld = load i8, ptr addrspace(4) %gep
@@ -146,8 +130,6 @@ define amdgpu_ps void @test_s_load_u8_sgpr(ptr addrspace(4) inreg %in, i32 inreg
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GCN-NEXT:    s_endpgm
   %zext1 = zext i32 %offset to i64
   %gep = getelementptr i8, ptr addrspace(4) %in, i64 %zext1
@@ -164,8 +146,6 @@ define amdgpu_ps void @test_s_load_u8_sgpr_imm(ptr addrspace(4) inreg %in, i32 i
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GCN-NEXT:    s_endpgm
   %gep1 = getelementptr i8, ptr addrspace(4) %in, i64 16
   %zext1= zext i32 %offset to i64
@@ -182,8 +162,6 @@ define amdgpu_ps void @test_s_load_u8_divergent(ptr addrspace(4) inreg %in, i32
 ; GCN-NEXT:    global_load_u8 v0, v0, s[0:1] offset:16
 ; GCN-NEXT:    s_wait_loadcnt 0x0
 ; GCN-NEXT:    global_store_b32 v[1:2], v0, off
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GCN-NEXT:    s_endpgm
   %gep1 = getelementptr i8, ptr addrspace(4) %in, i64 16
   %zext1= zext i32 %offset to i64
@@ -201,8 +179,6 @@ define amdgpu_ps void @test_s_load_i16(ptr addrspace(4) inreg %in, ptr addrspace
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GCN-NEXT:    s_endpgm
   %ld = load i16, ptr addrspace(4) %in
   %sext = sext i16 %ld to i32
@@ -221,8 +197,6 @@ define amdgpu_ps void @test_s_load_i16_imm(ptr addrspace(4) inreg %in, ptr addrs
 ; DAG-NEXT:    s_wait_kmcnt 0x0
 ; DAG-NEXT:    v_mov_b32_e32 v2, s0
 ; DAG-NEXT:    global_store_b32 v[0:1], v2, off
-; DAG-NEXT:    s_nop 0
-; DAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; DAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_s_load_i16_imm:
@@ -233,8 +207,6 @@ define amdgpu_ps void @test_s_load_i16_imm(ptr addrspace(4) inreg %in, ptr addrs
 ; GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GISEL-NEXT:    v_mov_b32_e32 v2, s0
 ; GISEL-NEXT:    global_store_b32 v[0:1], v2, off
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT:    s_endpgm
   %gep = getelementptr i16, ptr addrspace(4) %in, i64 -100
   %ld = load i16, ptr addrspace(4) %gep
@@ -250,8 +222,6 @@ define amdgpu_ps void @test_s_load_i16_sgpr(ptr addrspace(4) inreg %in, i32 inre
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GCN-NEXT:    s_endpgm
   %zext = zext i32 %offset to i64
   %gep = getelementptr i8, ptr addrspace(4) %in, i64 %zext
@@ -272,8 +242,6 @@ define amdgpu_ps void @test_s_load_i16_sgpr_imm(ptr addrspace(4) inreg %in, i32
 ; DAG-NEXT:    s_wait_kmcnt 0x0
 ; DAG-NEXT:    v_mov_b32_e32 v2, s0
 ; DAG-NEXT:    global_store_b32 v[0:1], v2, off
-; DAG-NEXT:    s_nop 0
-; DAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; DAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_s_load_i16_sgpr_imm:
@@ -287,8 +255,6 @@ define amdgpu_ps void @test_s_load_i16_sgpr_imm(ptr addrspace(4) inreg %in, i32
 ; GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GISEL-NEXT:    v_mov_b32_e32 v2, s0
 ; GISEL-NEXT:    global_store_b32 v[0:1], v2, off
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT:    s_endpgm
   %gep1 = getelementptr i16, ptr addrspace(4) %in, i64 16
   %zext = zext i32 %offset to i64
@@ -311,8 +277,6 @@ define amdgpu_ps void @test_s_load_i16_divergent(ptr addrspace(4) inreg %in, i32
 ; DAG-NEXT:    global_load_i16 v0, v[3:4], off offset:32
 ; DAG-NEXT:    s_wait_loadcnt 0x0
 ; DAG-NEXT:    global_store_b32 v[1:2], v0, off
-; DAG-NEXT:    s_nop 0
-; DAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; DAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_s_load_i16_divergent:
@@ -328,8 +292,6 @@ define amdgpu_ps void @test_s_load_i16_divergent(ptr addrspace(4) inreg %in, i32
 ; GISEL-NEXT:    global_load_i16 v0, v[0:1], off offset:32
 ; GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GISEL-NEXT:    global_store_b32 v[3:4], v0, off
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT:    s_endpgm
   %gep1 = getelementptr i16, ptr addrspace(4) %in, i64 16
   %zext = zext i32 %offset to i64
@@ -347,8 +309,6 @@ define amdgpu_ps void @test_s_load_u16(ptr addrspace(4) inreg %in, ptr addrspace
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GCN-NEXT:    s_endpgm
   %ld = load i16, ptr addrspace(4) %in
   %zext = zext i16 %ld to i32
@@ -363,8 +323,6 @@ define amdgpu_ps void @test_s_load_u16_imm(ptr addrspace(4) inreg %in, ptr addrs
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GCN-NEXT:    s_endpgm
   %gep = getelementptr i16, ptr addrspace(4) %in, i64 255
   %ld = load i16, ptr addrspace(4) %gep
@@ -380,8 +338,6 @@ define amdgpu_ps void @test_s_load_u16_sgpr(ptr addrspace(4) inreg %in, i32 inre
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GCN-NEXT:    s_endpgm
   %zext1 = zext i32 %offset to i64
   %gep = getelementptr i8, ptr addrspace(4) %in, i64 %zext1
@@ -402,8 +358,6 @@ define amdgpu_ps void @test_s_load_u16_sgpr_imm(ptr addrspace(4) inreg %in, i32
 ; DAG-NEXT:    s_wait_kmcnt 0x0
 ; DAG-NEXT:    v_mov_b32_e32 v2, s0
 ; DAG-NEXT:    global_store_b32 v[0:1], v2, off
-; DAG-NEXT:    s_nop 0
-; DAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; DAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_s_load_u16_sgpr_imm:
@@ -417,8 +371,6 @@ define amdgpu_ps void @test_s_load_u16_sgpr_imm(ptr addrspace(4) inreg %in, i32
 ; GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GISEL-NEXT:    v_mov_b32_e32 v2, s0
 ; GISEL-NEXT:    global_store_b32 v[0:1], v2, off
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT:    s_endpgm
   %gep1 = getelementptr i16, ptr addrspace(4) %in, i64 16
   %zext1= zext i32 %offset to i64
@@ -441,8 +393,6 @@ define amdgpu_ps void @test_s_load_u16_divergent(ptr addrspace(4) inreg %in, i32
 ; DAG-NEXT:    global_load_u16 v0, v[3:4], off offset:32
 ; DAG-NEXT:    s_wait_loadcnt 0x0
 ; DAG-NEXT:    global_store_b32 v[1:2], v0, off
-; DAG-NEXT:    s_nop 0
-; DAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; DAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_s_load_u16_divergent:
@@ -458,8 +408,6 @@ define amdgpu_ps void @test_s_load_u16_divergent(ptr addrspace(4) inreg %in, i32
 ; GISEL-NEXT:    global_load_u16 v0, v[0:1], off offset:32
 ; GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GISEL-NEXT:    global_store_b32 v[3:4], v0, off
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT:    s_endpgm
   %gep1 = getelementptr i16, ptr addrspace(4) %in, i64 16
   %zext1= zext i32 %offset to i64
@@ -477,8 +425,6 @@ define amdgpu_ps void @s_buffer_load_byte_imm_offset(<4 x i32> inreg %src, ptr a
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GCN-NEXT:    s_endpgm
 main_body:
   %ld = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> %src, i32 4, i32 0)
@@ -494,8 +440,6 @@ define amdgpu_ps void @s_buffer_load_byte_sgpr(<4 x i32> inreg %src, ptr addrspa
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GCN-NEXT:    s_endpgm
 main_body:
   %ld = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> %src, i32 %offset, i32 0)
@@ -511,8 +455,6 @@ define amdgpu_ps void @s_buffer_load_byte_sgpr_or_imm_offset(<4 x i32> inreg %sr
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GCN-NEXT:    s_endpgm
 main_body:
   %off = add nuw nsw i32 %in, 100
@@ -528,8 +470,6 @@ define amdgpu_ps void @s_buffer_load_byte_sgpr_or_imm_offset_divergent(<4 x i32>
 ; DAG-NEXT:    buffer_load_i8 v2, v2, s[0:3], null offen
 ; DAG-NEXT:    s_wait_loadcnt 0x0
 ; DAG-NEXT:    global_store_b32 v[0:1], v2, off
-; DAG-NEXT:    s_nop 0
-; DAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; DAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: s_buffer_load_byte_sgpr_or_imm_offset_divergent:
@@ -537,8 +477,6 @@ define amdgpu_ps void @s_buffer_load_byte_sgpr_or_imm_offset_divergent(<4 x i32>
 ; GISEL-NEXT:    buffer_load_b32 v2, v2, s[0:3], null offen
 ; GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GISEL-NEXT:    global_store_b32 v[0:1], v2, off
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT:    s_endpgm
 main_body:
   %ld = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> %src, i32 %offset, i32 0)
@@ -556,8 +494,6 @@ define amdgpu_ps void @s_buffer_load_ubyte_imm_offset(<4 x i32> inreg %src, ptr
 ; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GCN-NEXT:    s_endpgm
 main_body:
   %ld = call i8 @llvm.amdgcn.s.buffer.load.u8(<4 x i32> %src, i32 4, i32 0)
@@ -575,8 +511,6 @@ define amdgpu_ps void @s_buffer_load_ubyte_sgpr(<4 x i32> inreg %src, ptr addrsp
 ; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GCN-NEXT:    s_endpgm
 main_body:
   %ld = call i8 @llvm.amdgcn.s.buffer.load.u8(<4 x i32> %src, i32 %offset, i32 0)
@@ -594,8 +528,6 @@ define amdgpu_ps void @s_buffer_load_ubyte_sgpr_or_imm_offset(<4 x i32> inreg %s
 ; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GCN-NEXT:    s_endpgm
 main_body:
   %off = add nuw nsw i32 %in, 100
@@ -611,8 +543,6 @@ define amdgpu_ps void @s_buffer_load_ubyte_sgpr_or_imm_offset_divergent(<4 x i32
 ; DAG-NEXT:    buffer_load_u8 v2, v2, s[0:3], null offen
 ; DAG-NEXT:    s_wait_loadcnt 0x0
 ; DAG-NEXT:    global_store_b32 v[0:1], v2, off
-; DAG-NEXT:    s_nop 0
-; DAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; DAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: s_buffer_load_ubyte_sgpr_or_imm_offset_divergent:
@@ -621,8 +551,6 @@ define amdgpu_ps void @s_buffer_load_ubyte_sgpr_or_imm_offset_divergent(<4 x i32
 ; GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; GISEL-NEXT:    global_store_b32 v[0:1], v2, off
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT:    s_endpgm
 main_body:
   %ld = call i8 @llvm.amdgcn.s.buffer.load.u8(<4 x i32> %src, i32 %offset, i32 0)
@@ -638,8 +566,6 @@ define amdgpu_ps void @s_buffer_load_short_imm_offset(<4 x i32> inreg %src, ptr
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GCN-NEXT:    s_endpgm
 main_body:
   %ld = call i16 @llvm.amdgcn.s.buffer.load.i16(<4 x i32> %src, i32 4, i32 0)
@@ -655,8 +581,6 @@ define amdgpu_ps void @s_buffer_load_short_sgpr(<4 x i32> inreg %src, ptr addrsp
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GCN-NEXT:    s_endpgm
 main_body:
   %ld = call i16 @llvm.amdgcn.s.buffer.load.i16(<4 x i32> %src, i32 %offset, i32 0)
@@ -672,8 +596,6 @@ define amdgpu_ps void @s_buffer_load_short_sgpr_or_imm_offset(<4 x i32> inreg %s
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GCN-NEXT:    s_endpgm
 main_body:
   %off = add nuw nsw i32 %in, 100
@@ -689,8 +611,6 @@ define amdgpu_ps void @s_buffer_load_short_sgpr_or_imm_offset_divergent(<4 x i32
 ; DAG-NEXT:    buffer_load_i16 v2, v2, s[0:3], null offen
 ; DAG-NEXT:    s_wait_loadcnt 0x0
 ; DAG-NEXT:    global_store_b32 v[0:1], v2, off
-; DAG-NEXT:    s_nop 0
-; DAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; DAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: s_buffer_load_short_sgpr_or_imm_offset_divergent:
@@ -698,8 +618,6 @@ define amdgpu_ps void @s_buffer_load_short_sgpr_or_imm_offset_divergent(<4 x i32
 ; GISEL-NEXT:    buffer_load_b32 v2, v2, s[0:3], null offen
 ; GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GISEL-NEXT:    global_store_b32 v[0:1], v2, off
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT:    s_endpgm
 main_body:
   %ld = call i16 @llvm.amdgcn.s.buffer.load.i16(<4 x i32> %src, i32 %offset, i32 0)
@@ -717,8 +635,6 @@ define amdgpu_ps void @s_buffer_load_ushort_imm_offset(<4 x i32> inreg %src, ptr
 ; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GCN-NEXT:    s_endpgm
 main_body:
   %ld = call i16 @llvm.amdgcn.s.buffer.load.u16(<4 x i32> %src, i32 4, i32 0)
@@ -736,8 +652,6 @@ define amdgpu_ps void @s_buffer_load_ushort_sgpr(<4 x i32> inreg %src, ptr addrs
 ; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GCN-NEXT:    s_endpgm
 main_body:
   %ld = call i16 @llvm.amdgcn.s.buffer.load.u16(<4 x i32> %src, i32 %offset, i32 0)
@@ -755,8 +669,6 @@ define amdgpu_ps void @s_buffer_load_ushort_sgpr_or_imm_offset(<4 x i32> inreg %
 ; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GCN-NEXT:    s_endpgm
 main_body:
   %off = add nuw nsw i32 %in, 100
@@ -772,8 +684,6 @@ define amdgpu_ps void @s_buffer_load_ushort_sgpr_or_imm_offset_divergent(<4 x i3
 ; DAG-NEXT:    buffer_load_u16 v2, v2, s[0:3], null offen
 ; DAG-NEXT:    s_wait_loadcnt 0x0
 ; DAG-NEXT:    global_store_b32 v[0:1], v2, off
-; DAG-NEXT:    s_nop 0
-; DAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; DAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: s_buffer_load_ushort_sgpr_or_imm_offset_divergent:
@@ -782,8 +692,6 @@ define amdgpu_ps void @s_buffer_load_ushort_sgpr_or_imm_offset_divergent(<4 x i3
 ; GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GISEL-NEXT:    global_store_b32 v[0:1], v2, off
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT:    s_endpgm
 main_body:
   %ld = call i16 @llvm.amdgcn.s.buffer.load.u16(<4 x i32> %src, i32 %offset, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
index 059f925ee99a42..dcd366e779449e 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
@@ -23054,8 +23054,6 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #1 {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_atomic_add_f32 v0, v1, s[2:3] scope:SCOPE_DEV
 ; GFX12-NEXT:  .LBB92_2:
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 ;
 ; GFX940-LABEL: infer_as_before_atomic:
@@ -23096,8 +23094,6 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #1 {
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_atomic_add_f32 v0, v1, s[2:3]
 ; GFX11-NEXT:  .LBB92_2:
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: infer_as_before_atomic:
diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll
index a3c8bb141fd93b..fb72dcacee4cf5 100644
--- a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll
@@ -2938,8 +2938,6 @@ define amdgpu_ps void @global_inc_saddr_i32_nortn(ptr addrspace(1) inreg %sbase,
 ; GFX11-LABEL: global_inc_saddr_i32_nortn:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_atomic_inc_u32 v0, v1, s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2956,8 +2954,6 @@ define amdgpu_ps void @global_inc_saddr_i32_nortn_neg128(ptr addrspace(1) inreg
 ; GFX11-LABEL: global_inc_saddr_i32_nortn_neg128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_atomic_inc_u32 v0, v1, s[2:3] offset:-128
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -3014,8 +3010,6 @@ define amdgpu_ps void @global_inc_saddr_i64_nortn(ptr addrspace(1) inreg %sbase,
 ; GFX11-LABEL: global_inc_saddr_i64_nortn:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_atomic_inc_u64 v0, v[1:2], s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -3032,8 +3026,6 @@ define amdgpu_ps void @global_inc_saddr_i64_nortn_neg128(ptr addrspace(1) inreg
 ; GFX11-LABEL: global_inc_saddr_i64_nortn_neg128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_atomic_inc_u64 v0, v[1:2], s[2:3] offset:-128
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -3095,8 +3087,6 @@ define amdgpu_ps void @global_dec_saddr_i32_nortn(ptr addrspace(1) inreg %sbase,
 ; GFX11-LABEL: global_dec_saddr_i32_nortn:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_atomic_dec_u32 v0, v1, s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -3113,8 +3103,6 @@ define amdgpu_ps void @global_dec_saddr_i32_nortn_neg128(ptr addrspace(1) inreg
 ; GFX11-LABEL: global_dec_saddr_i32_nortn_neg128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_atomic_dec_u32 v0, v1, s[2:3] offset:-128
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -3171,8 +3159,6 @@ define amdgpu_ps void @global_dec_saddr_i64_nortn(ptr addrspace(1) inreg %sbase,
 ; GFX11-LABEL: global_dec_saddr_i64_nortn:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_atomic_dec_u64 v0, v[1:2], s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -3189,8 +3175,6 @@ define amdgpu_ps void @global_dec_saddr_i64_nortn_neg128(ptr addrspace(1) inreg
 ; GFX11-LABEL: global_dec_saddr_i64_nortn_neg128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_atomic_dec_u64 v0, v[1:2], s[2:3] offset:-128
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-store.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-store.ll
index 790056b320d8cb..0b061da575a2b2 100644
--- a/llvm/test/CodeGen/AMDGPU/global-saddr-store.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-saddr-store.ll
@@ -19,8 +19,6 @@ define amdgpu_ps void @global_store_saddr_i8_zext_vgpr(ptr addrspace(1) inreg %s
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b8 v0, v2, s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_store_saddr_i8_zext_vgpr:
@@ -28,8 +26,6 @@ define amdgpu_ps void @global_store_saddr_i8_zext_vgpr(ptr addrspace(1) inreg %s
 ; GFX12-NEXT:    global_load_b32 v0, v[0:1], off
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b8 v0, v2, s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %voffset = load i32, ptr addrspace(1) %voffset.ptr
   %zext.offset = zext i32 %voffset to i64
@@ -52,8 +48,6 @@ define amdgpu_ps void @global_store_saddr_i8_zext_vgpr_offset_2047(ptr addrspace
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b8 v0, v2, s[2:3] offset:2047
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_store_saddr_i8_zext_vgpr_offset_2047:
@@ -61,8 +55,6 @@ define amdgpu_ps void @global_store_saddr_i8_zext_vgpr_offset_2047(ptr addrspace
 ; GFX12-NEXT:    global_load_b32 v0, v[0:1], off
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b8 v0, v2, s[2:3] offset:2047
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %voffset = load i32, ptr addrspace(1) %voffset.ptr
   %zext.offset = zext i32 %voffset to i64
@@ -86,8 +78,6 @@ define amdgpu_ps void @global_store_saddr_i8_zext_vgpr_offset_neg2048(ptr addrsp
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b8 v0, v2, s[2:3] offset:-2048
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_store_saddr_i8_zext_vgpr_offset_neg2048:
@@ -95,8 +85,6 @@ define amdgpu_ps void @global_store_saddr_i8_zext_vgpr_offset_neg2048(ptr addrsp
 ; GFX12-NEXT:    global_load_b32 v0, v[0:1], off
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b8 v0, v2, s[2:3] offset:-2048
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %voffset = load i32, ptr addrspace(1) %voffset.ptr
   %zext.offset = zext i32 %voffset to i64
@@ -143,8 +131,6 @@ define amdgpu_ps void @global_store_saddr_uniform_ptr_in_vgprs(i32 %voffset, i8
 ; GFX11-NEXT:    v_readfirstlane_b32 s0, v2
 ; GFX11-NEXT:    v_readfirstlane_b32 s1, v3
 ; GFX11-NEXT:    global_store_b8 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_store_saddr_uniform_ptr_in_vgprs:
@@ -155,8 +141,6 @@ define amdgpu_ps void @global_store_saddr_uniform_ptr_in_vgprs(i32 %voffset, i8
 ; GFX12-NEXT:    v_readfirstlane_b32 s0, v2
 ; GFX12-NEXT:    v_readfirstlane_b32 s1, v3
 ; GFX12-NEXT:    global_store_b8 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %sbase = load ptr addrspace(1), ptr addrspace(3) @ptr.in.lds
   %zext.offset = zext i32 %voffset to i64
@@ -196,8 +180,6 @@ define amdgpu_ps void @global_store_saddr_uniform_ptr_in_vgprs_immoffset(i32 %vo
 ; GFX11-NEXT:    v_readfirstlane_b32 s0, v2
 ; GFX11-NEXT:    v_readfirstlane_b32 s1, v3
 ; GFX11-NEXT:    global_store_b8 v0, v1, s[0:1] offset:-120
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_store_saddr_uniform_ptr_in_vgprs_immoffset:
@@ -208,8 +190,6 @@ define amdgpu_ps void @global_store_saddr_uniform_ptr_in_vgprs_immoffset(i32 %vo
 ; GFX12-NEXT:    v_readfirstlane_b32 s0, v2
 ; GFX12-NEXT:    v_readfirstlane_b32 s1, v3
 ; GFX12-NEXT:    global_store_b8 v0, v1, s[0:1] offset:-120
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %sbase = load ptr addrspace(1), ptr addrspace(3) @ptr.in.lds
   %zext.offset = zext i32 %voffset to i64
@@ -232,15 +212,11 @@ define amdgpu_ps void @global_store_saddr_i16_zext_vgpr(ptr addrspace(1) inreg %
 ; GFX11-LABEL: global_store_saddr_i16_zext_vgpr:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_store_saddr_i16_zext_vgpr:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_store_b16 v0, v1, s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -257,15 +233,11 @@ define amdgpu_ps void @global_store_saddr_i16_zext_vgpr_offset_neg128(ptr addrsp
 ; GFX11-LABEL: global_store_saddr_i16_zext_vgpr_offset_neg128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[2:3] offset:-128
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_store_saddr_i16_zext_vgpr_offset_neg128:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_store_b16 v0, v1, s[2:3] offset:-128
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -283,15 +255,11 @@ define amdgpu_ps void @global_store_saddr_f16_zext_vgpr(ptr addrspace(1) inreg %
 ; GFX11-LABEL: global_store_saddr_f16_zext_vgpr:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_store_saddr_f16_zext_vgpr:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_store_b16 v0, v1, s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -308,15 +276,11 @@ define amdgpu_ps void @global_store_saddr_f16_zext_vgpr_offset_neg128(ptr addrsp
 ; GFX11-LABEL: global_store_saddr_f16_zext_vgpr_offset_neg128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[2:3] offset:-128
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_store_saddr_f16_zext_vgpr_offset_neg128:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_store_b16 v0, v1, s[2:3] offset:-128
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -334,15 +298,11 @@ define amdgpu_ps void @global_store_saddr_i32_zext_vgpr(ptr addrspace(1) inreg %
 ; GFX11-LABEL: global_store_saddr_i32_zext_vgpr:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_store_saddr_i32_zext_vgpr:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -359,15 +319,11 @@ define amdgpu_ps void @global_store_saddr_i32_zext_vgpr_offset_neg128(ptr addrsp
 ; GFX11-LABEL: global_store_saddr_i32_zext_vgpr_offset_neg128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[2:3] offset:-128
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_store_saddr_i32_zext_vgpr_offset_neg128:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[2:3] offset:-128
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -385,15 +341,11 @@ define amdgpu_ps void @global_store_saddr_f32_zext_vgpr(ptr addrspace(1) inreg %
 ; GFX11-LABEL: global_store_saddr_f32_zext_vgpr:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_store_saddr_f32_zext_vgpr:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -410,15 +362,11 @@ define amdgpu_ps void @global_store_saddr_f32_zext_vgpr_offset_neg128(ptr addrsp
 ; GFX11-LABEL: global_store_saddr_f32_zext_vgpr_offset_neg128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[2:3] offset:-128
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_store_saddr_f32_zext_vgpr_offset_neg128:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[2:3] offset:-128
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -436,15 +384,11 @@ define amdgpu_ps void @global_store_saddr_p3_zext_vgpr(ptr addrspace(1) inreg %s
 ; GFX11-LABEL: global_store_saddr_p3_zext_vgpr:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_store_saddr_p3_zext_vgpr:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -461,15 +405,11 @@ define amdgpu_ps void @global_store_saddr_p3_zext_vgpr_offset_neg128(ptr addrspa
 ; GFX11-LABEL: global_store_saddr_p3_zext_vgpr_offset_neg128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[2:3] offset:-128
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_store_saddr_p3_zext_vgpr_offset_neg128:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[2:3] offset:-128
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -487,15 +427,11 @@ define amdgpu_ps void @global_store_saddr_i64_zext_vgpr(ptr addrspace(1) inreg %
 ; GFX11-LABEL: global_store_saddr_i64_zext_vgpr:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_store_b64 v0, v[1:2], s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_store_saddr_i64_zext_vgpr:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_store_b64 v0, v[1:2], s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -512,15 +448,11 @@ define amdgpu_ps void @global_store_saddr_i64_zext_vgpr_offset_neg128(ptr addrsp
 ; GFX11-LABEL: global_store_saddr_i64_zext_vgpr_offset_neg128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_store_b64 v0, v[1:2], s[2:3] offset:-128
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_store_saddr_i64_zext_vgpr_offset_neg128:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_store_b64 v0, v[1:2], s[2:3] offset:-128
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -538,15 +470,11 @@ define amdgpu_ps void @global_store_saddr_f64_zext_vgpr(ptr addrspace(1) inreg %
 ; GFX11-LABEL: global_store_saddr_f64_zext_vgpr:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_store_b64 v0, v[1:2], s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_store_saddr_f64_zext_vgpr:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_store_b64 v0, v[1:2], s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -563,15 +491,11 @@ define amdgpu_ps void @global_store_saddr_f64_zext_vgpr_offset_neg128(ptr addrsp
 ; GFX11-LABEL: global_store_saddr_f64_zext_vgpr_offset_neg128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_store_b64 v0, v[1:2], s[2:3] offset:-128
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_store_saddr_f64_zext_vgpr_offset_neg128:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_store_b64 v0, v[1:2], s[2:3] offset:-128
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -589,15 +513,11 @@ define amdgpu_ps void @global_store_saddr_v2i32_zext_vgpr(ptr addrspace(1) inreg
 ; GFX11-LABEL: global_store_saddr_v2i32_zext_vgpr:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_store_b64 v0, v[1:2], s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_store_saddr_v2i32_zext_vgpr:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_store_b64 v0, v[1:2], s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -614,15 +534,11 @@ define amdgpu_ps void @global_store_saddr_v2i32_zext_vgpr_offset_neg128(ptr addr
 ; GFX11-LABEL: global_store_saddr_v2i32_zext_vgpr_offset_neg128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_store_b64 v0, v[1:2], s[2:3] offset:-128
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_store_saddr_v2i32_zext_vgpr_offset_neg128:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_store_b64 v0, v[1:2], s[2:3] offset:-128
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -640,15 +556,11 @@ define amdgpu_ps void @global_store_saddr_v2f32_zext_vgpr(ptr addrspace(1) inreg
 ; GFX11-LABEL: global_store_saddr_v2f32_zext_vgpr:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_store_b64 v0, v[1:2], s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_store_saddr_v2f32_zext_vgpr:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_store_b64 v0, v[1:2], s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -665,15 +577,11 @@ define amdgpu_ps void @global_store_saddr_v2f32_zext_vgpr_offset_neg128(ptr addr
 ; GFX11-LABEL: global_store_saddr_v2f32_zext_vgpr_offset_neg128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_store_b64 v0, v[1:2], s[2:3] offset:-128
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_store_saddr_v2f32_zext_vgpr_offset_neg128:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_store_b64 v0, v[1:2], s[2:3] offset:-128
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -691,15 +599,11 @@ define amdgpu_ps void @global_store_saddr_v4i16_zext_vgpr(ptr addrspace(1) inreg
 ; GFX11-LABEL: global_store_saddr_v4i16_zext_vgpr:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_store_b64 v0, v[1:2], s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_store_saddr_v4i16_zext_vgpr:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_store_b64 v0, v[1:2], s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -716,15 +620,11 @@ define amdgpu_ps void @global_store_saddr_v4i16_zext_vgpr_offset_neg128(ptr addr
 ; GFX11-LABEL: global_store_saddr_v4i16_zext_vgpr_offset_neg128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_store_b64 v0, v[1:2], s[2:3] offset:-128
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_store_saddr_v4i16_zext_vgpr_offset_neg128:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_store_b64 v0, v[1:2], s[2:3] offset:-128
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -742,15 +642,11 @@ define amdgpu_ps void @global_store_saddr_v4f16_zext_vgpr(ptr addrspace(1) inreg
 ; GFX11-LABEL: global_store_saddr_v4f16_zext_vgpr:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_store_b64 v0, v[1:2], s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_store_saddr_v4f16_zext_vgpr:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_store_b64 v0, v[1:2], s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -767,15 +663,11 @@ define amdgpu_ps void @global_store_saddr_v4f16_zext_vgpr_offset_neg128(ptr addr
 ; GFX11-LABEL: global_store_saddr_v4f16_zext_vgpr_offset_neg128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_store_b64 v0, v[1:2], s[2:3] offset:-128
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_store_saddr_v4f16_zext_vgpr_offset_neg128:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_store_b64 v0, v[1:2], s[2:3] offset:-128
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -793,15 +685,11 @@ define amdgpu_ps void @global_store_saddr_p1_zext_vgpr(ptr addrspace(1) inreg %s
 ; GFX11-LABEL: global_store_saddr_p1_zext_vgpr:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_store_b64 v0, v[1:2], s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_store_saddr_p1_zext_vgpr:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_store_b64 v0, v[1:2], s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -818,15 +706,11 @@ define amdgpu_ps void @global_store_saddr_p1_zext_vgpr_offset_neg128(ptr addrspa
 ; GFX11-LABEL: global_store_saddr_p1_zext_vgpr_offset_neg128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_store_b64 v0, v[1:2], s[2:3] offset:-128
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_store_saddr_p1_zext_vgpr_offset_neg128:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_store_b64 v0, v[1:2], s[2:3] offset:-128
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -844,15 +728,11 @@ define amdgpu_ps void @global_store_saddr_v3i32_zext_vgpr(ptr addrspace(1) inreg
 ; GFX11-LABEL: global_store_saddr_v3i32_zext_vgpr:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_store_b96 v0, v[1:3], s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_store_saddr_v3i32_zext_vgpr:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_store_b96 v0, v[1:3], s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -869,15 +749,11 @@ define amdgpu_ps void @global_store_saddr_v3i32_zext_vgpr_offset_neg128(ptr addr
 ; GFX11-LABEL: global_store_saddr_v3i32_zext_vgpr_offset_neg128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_store_b96 v0, v[1:3], s[2:3] offset:-128
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_store_saddr_v3i32_zext_vgpr_offset_neg128:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_store_b96 v0, v[1:3], s[2:3] offset:-128
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -895,15 +771,11 @@ define amdgpu_ps void @global_store_saddr_v3f32_zext_vgpr(ptr addrspace(1) inreg
 ; GFX11-LABEL: global_store_saddr_v3f32_zext_vgpr:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_store_b96 v0, v[1:3], s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_store_saddr_v3f32_zext_vgpr:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_store_b96 v0, v[1:3], s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -920,15 +792,11 @@ define amdgpu_ps void @global_store_saddr_v3f32_zext_vgpr_offset_neg128(ptr addr
 ; GFX11-LABEL: global_store_saddr_v3f32_zext_vgpr_offset_neg128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_store_b96 v0, v[1:3], s[2:3] offset:-128
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_store_saddr_v3f32_zext_vgpr_offset_neg128:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_store_b96 v0, v[1:3], s[2:3] offset:-128
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -946,15 +814,11 @@ define amdgpu_ps void @global_store_saddr_v6i16_zext_vgpr(ptr addrspace(1) inreg
 ; GFX11-LABEL: global_store_saddr_v6i16_zext_vgpr:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_store_b96 v0, v[1:3], s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_store_saddr_v6i16_zext_vgpr:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_store_b96 v0, v[1:3], s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -971,15 +835,11 @@ define amdgpu_ps void @global_store_saddr_v6i16_zext_vgpr_offset_neg128(ptr addr
 ; GFX11-LABEL: global_store_saddr_v6i16_zext_vgpr_offset_neg128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_store_b96 v0, v[1:3], s[2:3] offset:-128
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_store_saddr_v6i16_zext_vgpr_offset_neg128:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_store_b96 v0, v[1:3], s[2:3] offset:-128
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -997,15 +857,11 @@ define amdgpu_ps void @global_store_saddr_v6f16_zext_vgpr(ptr addrspace(1) inreg
 ; GFX11-LABEL: global_store_saddr_v6f16_zext_vgpr:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_store_b96 v0, v[1:3], s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_store_saddr_v6f16_zext_vgpr:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_store_b96 v0, v[1:3], s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -1022,15 +878,11 @@ define amdgpu_ps void @global_store_saddr_v6f16_zext_vgpr_offset_neg128(ptr addr
 ; GFX11-LABEL: global_store_saddr_v6f16_zext_vgpr_offset_neg128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_store_b96 v0, v[1:3], s[2:3] offset:-128
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_store_saddr_v6f16_zext_vgpr_offset_neg128:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_store_b96 v0, v[1:3], s[2:3] offset:-128
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -1048,15 +900,11 @@ define amdgpu_ps void @global_store_saddr_v4i32_zext_vgpr(ptr addrspace(1) inreg
 ; GFX11-LABEL: global_store_saddr_v4i32_zext_vgpr:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_store_b128 v0, v[1:4], s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_store_saddr_v4i32_zext_vgpr:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_store_b128 v0, v[1:4], s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -1073,15 +921,11 @@ define amdgpu_ps void @global_store_saddr_v4i32_zext_vgpr_offset_neg128(ptr addr
 ; GFX11-LABEL: global_store_saddr_v4i32_zext_vgpr_offset_neg128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_store_b128 v0, v[1:4], s[2:3] offset:-128
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_store_saddr_v4i32_zext_vgpr_offset_neg128:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_store_b128 v0, v[1:4], s[2:3] offset:-128
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -1099,15 +943,11 @@ define amdgpu_ps void @global_store_saddr_v4f32_zext_vgpr(ptr addrspace(1) inreg
 ; GFX11-LABEL: global_store_saddr_v4f32_zext_vgpr:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_store_b128 v0, v[1:4], s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_store_saddr_v4f32_zext_vgpr:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_store_b128 v0, v[1:4], s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -1124,15 +964,11 @@ define amdgpu_ps void @global_store_saddr_v4f32_zext_vgpr_offset_neg128(ptr addr
 ; GFX11-LABEL: global_store_saddr_v4f32_zext_vgpr_offset_neg128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_store_b128 v0, v[1:4], s[2:3] offset:-128
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_store_saddr_v4f32_zext_vgpr_offset_neg128:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_store_b128 v0, v[1:4], s[2:3] offset:-128
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -1150,15 +986,11 @@ define amdgpu_ps void @global_store_saddr_v2i64_zext_vgpr(ptr addrspace(1) inreg
 ; GFX11-LABEL: global_store_saddr_v2i64_zext_vgpr:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_store_b128 v0, v[1:4], s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_store_saddr_v2i64_zext_vgpr:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_store_b128 v0, v[1:4], s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -1175,15 +1007,11 @@ define amdgpu_ps void @global_store_saddr_v2i64_zext_vgpr_offset_neg128(ptr addr
 ; GFX11-LABEL: global_store_saddr_v2i64_zext_vgpr_offset_neg128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_store_b128 v0, v[1:4], s[2:3] offset:-128
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_store_saddr_v2i64_zext_vgpr_offset_neg128:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_store_b128 v0, v[1:4], s[2:3] offset:-128
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -1201,15 +1029,11 @@ define amdgpu_ps void @global_store_saddr_v2f64_zext_vgpr(ptr addrspace(1) inreg
 ; GFX11-LABEL: global_store_saddr_v2f64_zext_vgpr:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_store_b128 v0, v[1:4], s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_store_saddr_v2f64_zext_vgpr:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_store_b128 v0, v[1:4], s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -1226,15 +1050,11 @@ define amdgpu_ps void @global_store_saddr_v2f64_zext_vgpr_offset_neg128(ptr addr
 ; GFX11-LABEL: global_store_saddr_v2f64_zext_vgpr_offset_neg128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_store_b128 v0, v[1:4], s[2:3] offset:-128
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_store_saddr_v2f64_zext_vgpr_offset_neg128:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_store_b128 v0, v[1:4], s[2:3] offset:-128
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -1252,15 +1072,11 @@ define amdgpu_ps void @global_store_saddr_v8i16_zext_vgpr(ptr addrspace(1) inreg
 ; GFX11-LABEL: global_store_saddr_v8i16_zext_vgpr:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_store_b128 v0, v[1:4], s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_store_saddr_v8i16_zext_vgpr:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_store_b128 v0, v[1:4], s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -1277,15 +1093,11 @@ define amdgpu_ps void @global_store_saddr_v8i16_zext_vgpr_offset_neg128(ptr addr
 ; GFX11-LABEL: global_store_saddr_v8i16_zext_vgpr_offset_neg128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_store_b128 v0, v[1:4], s[2:3] offset:-128
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_store_saddr_v8i16_zext_vgpr_offset_neg128:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_store_b128 v0, v[1:4], s[2:3] offset:-128
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -1303,15 +1115,11 @@ define amdgpu_ps void @global_store_saddr_v8f16_zext_vgpr(ptr addrspace(1) inreg
 ; GFX11-LABEL: global_store_saddr_v8f16_zext_vgpr:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_store_b128 v0, v[1:4], s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_store_saddr_v8f16_zext_vgpr:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_store_b128 v0, v[1:4], s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -1328,15 +1136,11 @@ define amdgpu_ps void @global_store_saddr_v8f16_zext_vgpr_offset_neg128(ptr addr
 ; GFX11-LABEL: global_store_saddr_v8f16_zext_vgpr_offset_neg128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_store_b128 v0, v[1:4], s[2:3] offset:-128
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_store_saddr_v8f16_zext_vgpr_offset_neg128:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_store_b128 v0, v[1:4], s[2:3] offset:-128
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -1354,15 +1158,11 @@ define amdgpu_ps void @global_store_saddr_v2p1_zext_vgpr(ptr addrspace(1) inreg
 ; GFX11-LABEL: global_store_saddr_v2p1_zext_vgpr:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_store_b128 v0, v[1:4], s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_store_saddr_v2p1_zext_vgpr:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_store_b128 v0, v[1:4], s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -1379,15 +1179,11 @@ define amdgpu_ps void @global_store_saddr_v2p1_zext_vgpr_offset_neg128(ptr addrs
 ; GFX11-LABEL: global_store_saddr_v2p1_zext_vgpr_offset_neg128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_store_b128 v0, v[1:4], s[2:3] offset:-128
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_store_saddr_v2p1_zext_vgpr_offset_neg128:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_store_b128 v0, v[1:4], s[2:3] offset:-128
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -1405,15 +1201,11 @@ define amdgpu_ps void @global_store_saddr_v4p3_zext_vgpr(ptr addrspace(1) inreg
 ; GFX11-LABEL: global_store_saddr_v4p3_zext_vgpr:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_store_b128 v0, v[1:4], s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_store_saddr_v4p3_zext_vgpr:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_store_b128 v0, v[1:4], s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -1430,15 +1222,11 @@ define amdgpu_ps void @global_store_saddr_v4p3_zext_vgpr_offset_neg128(ptr addrs
 ; GFX11-LABEL: global_store_saddr_v4p3_zext_vgpr_offset_neg128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_store_b128 v0, v[1:4], s[2:3] offset:-128
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_store_saddr_v4p3_zext_vgpr_offset_neg128:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_store_b128 v0, v[1:4], s[2:3] offset:-128
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -1460,16 +1248,12 @@ define amdgpu_ps void @atomic_global_store_saddr_i32_zext_vgpr(ptr addrspace(1)
 ; GFX11-LABEL: atomic_global_store_saddr_i32_zext_vgpr:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_global_store_saddr_i32_zext_vgpr:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[2:3] scope:SCOPE_SYS
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -1486,16 +1270,12 @@ define amdgpu_ps void @atomic_global_store_saddr_i32_zext_vgpr_offset_neg128(ptr
 ; GFX11-LABEL: atomic_global_store_saddr_i32_zext_vgpr_offset_neg128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[2:3] offset:-128
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_global_store_saddr_i32_zext_vgpr_offset_neg128:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[2:3] offset:-128 scope:SCOPE_SYS
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -1513,16 +1293,12 @@ define amdgpu_ps void @atomic_global_store_saddr_i64_zext_vgpr(ptr addrspace(1)
 ; GFX11-LABEL: atomic_global_store_saddr_i64_zext_vgpr:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_store_b64 v0, v[1:2], s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_global_store_saddr_i64_zext_vgpr:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX12-NEXT:    global_store_b64 v0, v[1:2], s[2:3] scope:SCOPE_SYS
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -1539,16 +1315,12 @@ define amdgpu_ps void @atomic_global_store_saddr_i64_zext_vgpr_offset_neg128(ptr
 ; GFX11-LABEL: atomic_global_store_saddr_i64_zext_vgpr_offset_neg128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_store_b64 v0, v[1:2], s[2:3] offset:-128
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_global_store_saddr_i64_zext_vgpr_offset_neg128:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX12-NEXT:    global_store_b64 v0, v[1:2], s[2:3] offset:-128 scope:SCOPE_SYS
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -1570,15 +1342,11 @@ define amdgpu_ps void @global_store_saddr_i16_d16hi_zext_vgpr(ptr addrspace(1) i
 ; GFX11-LABEL: global_store_saddr_i16_d16hi_zext_vgpr:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_store_d16_hi_b16 v0, v1, s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_store_saddr_i16_d16hi_zext_vgpr:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_store_d16_hi_b16 v0, v1, s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -1596,15 +1364,11 @@ define amdgpu_ps void @global_store_saddr_i16_d16hi_zext_vgpr_offset_neg128(ptr
 ; GFX11-LABEL: global_store_saddr_i16_d16hi_zext_vgpr_offset_neg128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_store_d16_hi_b16 v0, v1, s[2:3] offset:-128
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_store_saddr_i16_d16hi_zext_vgpr_offset_neg128:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_store_d16_hi_b16 v0, v1, s[2:3] offset:-128
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -1623,15 +1387,11 @@ define amdgpu_ps void @global_store_saddr_i16_d16hi_trunci8_zext_vgpr(ptr addrsp
 ; GFX11-LABEL: global_store_saddr_i16_d16hi_trunci8_zext_vgpr:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_store_d16_hi_b8 v0, v1, s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_store_saddr_i16_d16hi_trunci8_zext_vgpr:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_store_d16_hi_b8 v0, v1, s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -1650,15 +1410,11 @@ define amdgpu_ps void @global_store_saddr_i16_d16hi_trunci8_zext_vgpr_offset_neg
 ; GFX11-LABEL: global_store_saddr_i16_d16hi_trunci8_zext_vgpr_offset_neg128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_store_d16_hi_b8 v0, v1, s[2:3] offset:-128
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_store_saddr_i16_d16hi_trunci8_zext_vgpr_offset_neg128:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_store_d16_hi_b8 v0, v1, s[2:3] offset:-128
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
index ba2d48166b2e45..24c08ec86051f8 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
@@ -124,8 +124,6 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr addrspace(1) %out, ptr
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
@@ -283,8 +281,6 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr addrspace(1) %ou
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
@@ -417,8 +413,6 @@ define amdgpu_kernel void @atomic_add_i64_ret(ptr addrspace(1) %out, ptr addrspa
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile add ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst
@@ -570,8 +564,6 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr addrspace(1) %out, ptr
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
@@ -700,8 +692,6 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr addrspace(1) %out, ptr
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
@@ -859,8 +849,6 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr addrspace(1) %ou
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
@@ -993,8 +981,6 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr addrspace(1) %out, ptr addrspa
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile and ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst
@@ -1146,8 +1132,6 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr addrspace(1) %out, ptr
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
@@ -1276,8 +1260,6 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr addrspace(1) %out, ptr
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
@@ -1435,8 +1417,6 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr addrspace(1) %ou
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
@@ -1569,8 +1549,6 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr addrspace(1) %out, ptr addrspa
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile sub ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst
@@ -1722,8 +1700,6 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr addrspace(1) %out, ptr
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
@@ -1843,8 +1819,6 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr addrspace(1) %out, ptr
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
@@ -1993,8 +1967,6 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
@@ -2118,8 +2090,6 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr addrspace(1) %out, ptr addrspa
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile max ptr addrspace(1) %out, i64 %in syncscope("workgroup") seq_cst
@@ -2262,8 +2232,6 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
@@ -2383,8 +2351,6 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr addrspace(1) %out, ptr
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
@@ -2533,8 +2499,6 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
@@ -2658,8 +2622,6 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr addrspace(1) %out, ptr addrsp
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile umax ptr addrspace(1) %out, i64 %in syncscope("workgroup") seq_cst
@@ -2802,8 +2764,6 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
@@ -2923,8 +2883,6 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr addrspace(1) %out, ptr
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
@@ -3073,8 +3031,6 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
@@ -3198,8 +3154,6 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr addrspace(1) %out, ptr addrspa
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile min ptr addrspace(1) %out, i64 %in syncscope("workgroup") seq_cst
@@ -3342,8 +3296,6 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
@@ -3463,8 +3415,6 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr addrspace(1) %out, ptr
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
@@ -3613,8 +3563,6 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr addrspace(1) %o
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
@@ -3738,8 +3686,6 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr addrspace(1) %out, ptr addrsp
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile umin ptr addrspace(1) %out, i64 %in syncscope("workgroup") seq_cst
@@ -3882,8 +3828,6 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr addrspace(1) %out, ptr
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
@@ -4012,8 +3956,6 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr addrspace(1) %out, ptr a
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
@@ -4171,8 +4113,6 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr addrspace(1) %out
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
@@ -4305,8 +4245,6 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr addrspace(1) %out, ptr addrspac
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile or ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst
@@ -4458,8 +4396,6 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr addrspace(1) %out, ptr a
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
@@ -4698,8 +4634,6 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr addrspace(1) %out, ptr
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
@@ -4857,8 +4791,6 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr addrspace(1) %o
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
@@ -4991,8 +4923,6 @@ define amdgpu_kernel void @atomic_xchg_i64_ret(ptr addrspace(1) %out, ptr addrsp
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst
@@ -5144,8 +5074,6 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr addrspace(1) %out, ptr
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
@@ -5274,8 +5202,6 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr addrspace(1) %out, ptr
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
@@ -5433,8 +5359,6 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr addrspace(1) %ou
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
@@ -5567,8 +5491,6 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr addrspace(1) %out, ptr addrspa
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile xor ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst
@@ -5720,8 +5642,6 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr addrspace(1) %out, ptr
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
@@ -5941,8 +5861,6 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr addrspace(1) %out,
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v4, v[0:1], s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
@@ -6116,8 +6034,6 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr addrspace(1)
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v4, v[0:1], s[6:7]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
@@ -6265,8 +6181,6 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr addrspace(1) %out, ptr add
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v4, v[0:1], s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %val = cmpxchg volatile ptr addrspace(1) %out, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst
@@ -6434,8 +6348,6 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr addrspace(1) %out,
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v4, v[0:1], s[6:7]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
@@ -6500,8 +6412,6 @@ define amdgpu_kernel void @atomic_load_i64_offset(ptr addrspace(1) %in, ptr addr
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %in, i64 4
@@ -6567,8 +6477,6 @@ define amdgpu_kernel void @atomic_load_i64_neg_offset(ptr addrspace(1) %in, ptr
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %in, i64 -4
@@ -6630,8 +6538,6 @@ define amdgpu_kernel void @atomic_load_i64(ptr addrspace(1) %in, ptr addrspace(1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %val = load atomic i64, ptr addrspace(1) %in syncscope("agent") seq_cst, align 8
@@ -6711,8 +6617,6 @@ define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr addrspace(1) %in, p
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %in, i64 %index
@@ -6792,8 +6696,6 @@ define amdgpu_kernel void @atomic_load_i64_addr64(ptr addrspace(1) %in, ptr addr
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %in, i64 %index
@@ -6874,8 +6776,6 @@ define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr addrspace(1) %in, p
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr double, ptr addrspace(1) %in, i64 %index
@@ -6930,8 +6830,6 @@ define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, ptr addrspace(1) %ou
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3] offset:32 scope:SCOPE_SYS
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
@@ -6982,8 +6880,6 @@ define amdgpu_kernel void @atomic_store_i64(i64 %in, ptr addrspace(1) %out) {
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3] scope:SCOPE_SYS
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   store atomic i64 %in, ptr addrspace(1) %out seq_cst, align 8
@@ -7051,8 +6947,6 @@ define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, ptr addrspace
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[6:7], s[0:1]
 ; GFX12-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_SYS
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
@@ -7120,8 +7014,6 @@ define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, ptr addrspace(1) %ou
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[6:7], s[0:1]
 ; GFX12-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1] scope:SCOPE_SYS
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
@@ -7190,8 +7082,6 @@ define amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, ptr addrsp
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[6:7], s[0:1]
 ; GFX12-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_SYS
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr double, ptr addrspace(1) %out, i64 %index
@@ -7320,8 +7210,6 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr addrspace(1) %out, ptr
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
@@ -7525,8 +7413,6 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr addrspace(1) %out, ptr
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
index c3a197ce99859f..fa0ab4537fcd76 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
@@ -166,8 +166,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    global_atomic_add_f32 v1, v0, s[2:3]
 ; GFX1164-NEXT:  .LBB0_2:
-; GFX1164-NEXT:    s_nop 0
-; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe:
@@ -187,8 +185,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    global_atomic_add_f32 v1, v0, s[2:3]
 ; GFX1132-NEXT:  .LBB0_2:
-; GFX1132-NEXT:    s_nop 0
-; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 ;
 ; GFX7LESS-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe:
@@ -341,8 +337,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    global_atomic_add_f32 v1, v0, s[2:3]
 ; GFX1164-DPP-NEXT:  .LBB0_2:
-; GFX1164-DPP-NEXT:    s_nop 0
-; GFX1164-DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-DPP-NEXT:    s_endpgm
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe:
@@ -362,8 +356,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    global_atomic_add_f32 v1, v0, s[2:3]
 ; GFX1132-DPP-NEXT:  .LBB0_2:
-; GFX1132-DPP-NEXT:    s_nop 0
-; GFX1132-DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-DPP-NEXT:    s_endpgm
   %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory !1, !amdgpu.ignore.denormal.mode !1
   ret void
@@ -14014,8 +14006,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    global_atomic_add_f32 v1, v0, s[2:3]
 ; GFX1164-NEXT:  .LBB18_2:
-; GFX1164-NEXT:    s_nop 0
-; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -14035,8 +14025,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    global_atomic_add_f32 v1, v0, s[2:3]
 ; GFX1132-NEXT:  .LBB18_2:
-; GFX1132-NEXT:    s_nop 0
-; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 ;
 ; GFX7LESS-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -14189,8 +14177,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    global_atomic_add_f32 v1, v0, s[2:3]
 ; GFX1164-DPP-NEXT:  .LBB18_2:
-; GFX1164-DPP-NEXT:    s_nop 0
-; GFX1164-DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-DPP-NEXT:    s_endpgm
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -14210,8 +14196,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    global_atomic_add_f32 v1, v0, s[2:3]
 ; GFX1132-DPP-NEXT:  .LBB18_2:
-; GFX1132-DPP-NEXT:    s_nop 0
-; GFX1132-DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-DPP-NEXT:    s_endpgm
   %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0  monotonic, align 4, !amdgpu.no.fine.grained.memory !1, !amdgpu.no.remote.memory !1, !amdgpu.ignore.denormal.mode !1
   ret void
@@ -14368,8 +14352,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    global_atomic_add_f32 v1, v0, s[2:3]
 ; GFX1164-NEXT:  .LBB19_2:
-; GFX1164-NEXT:    s_nop 0
-; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -14389,8 +14371,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    global_atomic_add_f32 v1, v0, s[2:3]
 ; GFX1132-NEXT:  .LBB19_2:
-; GFX1132-NEXT:    s_nop 0
-; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 ;
 ; GFX7LESS-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -14543,8 +14523,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    global_atomic_add_f32 v1, v0, s[2:3]
 ; GFX1164-DPP-NEXT:  .LBB19_2:
-; GFX1164-DPP-NEXT:    s_nop 0
-; GFX1164-DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-DPP-NEXT:    s_endpgm
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -14564,8 +14542,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    global_atomic_add_f32 v1, v0, s[2:3]
 ; GFX1132-DPP-NEXT:  .LBB19_2:
-; GFX1132-DPP-NEXT:    s_nop 0
-; GFX1132-DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-DPP-NEXT:    s_endpgm
   %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0  monotonic, align 4, !amdgpu.no.fine.grained.memory !1, !amdgpu.no.remote.memory !1
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
index 69c6adf0300c22..9b9dd744945b15 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
@@ -124,8 +124,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    global_atomic_max_f32 v0, v1, s[0:1]
 ; GFX1164-NEXT:  .LBB0_2:
-; GFX1164-NEXT:    s_nop 0
-; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe:
@@ -141,8 +139,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    global_atomic_max_f32 v0, v1, s[0:1]
 ; GFX1132-NEXT:  .LBB0_2:
-; GFX1132-NEXT:    s_nop 0
-; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 ;
 ; GFX7LESS-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe:
@@ -253,8 +249,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    global_atomic_max_f32 v0, v1, s[0:1]
 ; GFX1164-DPP-NEXT:  .LBB0_2:
-; GFX1164-DPP-NEXT:    s_nop 0
-; GFX1164-DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-DPP-NEXT:    s_endpgm
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe:
@@ -270,8 +264,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    global_atomic_max_f32 v0, v1, s[0:1]
 ; GFX1132-DPP-NEXT:  .LBB0_2:
-; GFX1132-DPP-NEXT:    s_nop 0
-; GFX1132-DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-DPP-NEXT:    s_endpgm
   %result = atomicrmw fmax ptr addrspace(1) %ptr, float 4.0 syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory !1
   ret void
@@ -1158,8 +1150,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    global_atomic_max_f32 v0, v1, s[0:1]
 ; GFX1164-NEXT:  .LBB2_2:
-; GFX1164-NEXT:    s_nop 0
-; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe:
@@ -1175,8 +1165,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    global_atomic_max_f32 v0, v1, s[0:1]
 ; GFX1132-NEXT:  .LBB2_2:
-; GFX1132-NEXT:    s_nop 0
-; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 ;
 ; GFX7LESS-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe:
@@ -1287,8 +1275,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    global_atomic_max_f32 v0, v1, s[0:1]
 ; GFX1164-DPP-NEXT:  .LBB2_2:
-; GFX1164-DPP-NEXT:    s_nop 0
-; GFX1164-DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-DPP-NEXT:    s_endpgm
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe:
@@ -1304,8 +1290,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    global_atomic_max_f32 v0, v1, s[0:1]
 ; GFX1132-DPP-NEXT:  .LBB2_2:
-; GFX1132-DPP-NEXT:    s_nop 0
-; GFX1132-DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-DPP-NEXT:    s_endpgm
   %result = atomicrmw fmax ptr addrspace(1) %ptr, float 4.0 syncscope("one-as") monotonic, !amdgpu.no.fine.grained.memory !1
   ret void
@@ -2194,8 +2178,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    global_atomic_max_f32 v0, v1, s[0:1]
 ; GFX1164-NEXT:  .LBB4_2:
-; GFX1164-NEXT:    s_nop 0
-; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe:
@@ -2211,8 +2193,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    global_atomic_max_f32 v0, v1, s[0:1]
 ; GFX1132-NEXT:  .LBB4_2:
-; GFX1132-NEXT:    s_nop 0
-; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 ;
 ; GFX7LESS-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe:
@@ -2323,8 +2303,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    global_atomic_max_f32 v0, v1, s[0:1]
 ; GFX1164-DPP-NEXT:  .LBB4_2:
-; GFX1164-DPP-NEXT:    s_nop 0
-; GFX1164-DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-DPP-NEXT:    s_endpgm
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe:
@@ -2340,8 +2318,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    global_atomic_max_f32 v0, v1, s[0:1]
 ; GFX1132-DPP-NEXT:  .LBB4_2:
-; GFX1132-DPP-NEXT:    s_nop 0
-; GFX1132-DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-DPP-NEXT:    s_endpgm
   %result = atomicrmw fmax ptr addrspace(1) %ptr, float 4.0 monotonic, align 4, !amdgpu.no.fine.grained.memory !1
   ret void
@@ -8957,8 +8933,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    global_atomic_max_f32 v0, v1, s[0:1]
 ; GFX1164-NEXT:  .LBB12_2:
-; GFX1164-NEXT:    s_nop 0
-; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -8974,8 +8948,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    global_atomic_max_f32 v0, v1, s[0:1]
 ; GFX1132-NEXT:  .LBB12_2:
-; GFX1132-NEXT:    s_nop 0
-; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 ;
 ; GFX7LESS-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -9086,8 +9058,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    global_atomic_max_f32 v0, v1, s[0:1]
 ; GFX1164-DPP-NEXT:  .LBB12_2:
-; GFX1164-DPP-NEXT:    s_nop 0
-; GFX1164-DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-DPP-NEXT:    s_endpgm
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -9103,8 +9073,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    global_atomic_max_f32 v0, v1, s[0:1]
 ; GFX1132-DPP-NEXT:  .LBB12_2:
-; GFX1132-DPP-NEXT:    s_nop 0
-; GFX1132-DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-DPP-NEXT:    s_endpgm
   %result = atomicrmw fmax ptr addrspace(1) %ptr, float 4.0  monotonic, align 4, !amdgpu.no.fine.grained.memory !1, !amdgpu.no.remote.memory !1, !amdgpu.ignore.denormal.mode !1
   ret void
@@ -9219,8 +9187,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    global_atomic_max_f32 v0, v1, s[0:1]
 ; GFX1164-NEXT:  .LBB13_2:
-; GFX1164-NEXT:    s_nop 0
-; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -9236,8 +9202,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    global_atomic_max_f32 v0, v1, s[0:1]
 ; GFX1132-NEXT:  .LBB13_2:
-; GFX1132-NEXT:    s_nop 0
-; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 ;
 ; GFX7LESS-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -9348,8 +9312,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    global_atomic_max_f32 v0, v1, s[0:1]
 ; GFX1164-DPP-NEXT:  .LBB13_2:
-; GFX1164-DPP-NEXT:    s_nop 0
-; GFX1164-DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-DPP-NEXT:    s_endpgm
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -9365,8 +9327,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    global_atomic_max_f32 v0, v1, s[0:1]
 ; GFX1132-DPP-NEXT:  .LBB13_2:
-; GFX1132-DPP-NEXT:    s_nop 0
-; GFX1132-DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-DPP-NEXT:    s_endpgm
   %result = atomicrmw fmax ptr addrspace(1) %ptr, float 4.0  monotonic, align 4, !amdgpu.no.fine.grained.memory !1, !amdgpu.no.remote.memory !1
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
index b7890f30f77603..21a65851db1d55 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
@@ -124,8 +124,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    global_atomic_min_f32 v0, v1, s[0:1]
 ; GFX1164-NEXT:  .LBB0_2:
-; GFX1164-NEXT:    s_nop 0
-; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe:
@@ -141,8 +139,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    global_atomic_min_f32 v0, v1, s[0:1]
 ; GFX1132-NEXT:  .LBB0_2:
-; GFX1132-NEXT:    s_nop 0
-; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 ;
 ; GFX7LESS-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe:
@@ -253,8 +249,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    global_atomic_min_f32 v0, v1, s[0:1]
 ; GFX1164-DPP-NEXT:  .LBB0_2:
-; GFX1164-DPP-NEXT:    s_nop 0
-; GFX1164-DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-DPP-NEXT:    s_endpgm
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe:
@@ -270,8 +264,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    global_atomic_min_f32 v0, v1, s[0:1]
 ; GFX1132-DPP-NEXT:  .LBB0_2:
-; GFX1132-DPP-NEXT:    s_nop 0
-; GFX1132-DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-DPP-NEXT:    s_endpgm
   %result = atomicrmw fmin ptr addrspace(1) %ptr, float 4.0 syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory !1
   ret void
@@ -1158,8 +1150,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    global_atomic_min_f32 v0, v1, s[0:1]
 ; GFX1164-NEXT:  .LBB2_2:
-; GFX1164-NEXT:    s_nop 0
-; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe:
@@ -1175,8 +1165,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    global_atomic_min_f32 v0, v1, s[0:1]
 ; GFX1132-NEXT:  .LBB2_2:
-; GFX1132-NEXT:    s_nop 0
-; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 ;
 ; GFX7LESS-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe:
@@ -1287,8 +1275,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    global_atomic_min_f32 v0, v1, s[0:1]
 ; GFX1164-DPP-NEXT:  .LBB2_2:
-; GFX1164-DPP-NEXT:    s_nop 0
-; GFX1164-DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-DPP-NEXT:    s_endpgm
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe:
@@ -1304,8 +1290,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    global_atomic_min_f32 v0, v1, s[0:1]
 ; GFX1132-DPP-NEXT:  .LBB2_2:
-; GFX1132-DPP-NEXT:    s_nop 0
-; GFX1132-DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-DPP-NEXT:    s_endpgm
   %result = atomicrmw fmin ptr addrspace(1) %ptr, float 4.0 syncscope("one-as") monotonic, !amdgpu.no.fine.grained.memory !1
   ret void
@@ -2194,8 +2178,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    global_atomic_min_f32 v0, v1, s[0:1]
 ; GFX1164-NEXT:  .LBB4_2:
-; GFX1164-NEXT:    s_nop 0
-; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe:
@@ -2211,8 +2193,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    global_atomic_min_f32 v0, v1, s[0:1]
 ; GFX1132-NEXT:  .LBB4_2:
-; GFX1132-NEXT:    s_nop 0
-; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 ;
 ; GFX7LESS-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe:
@@ -2323,8 +2303,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    global_atomic_min_f32 v0, v1, s[0:1]
 ; GFX1164-DPP-NEXT:  .LBB4_2:
-; GFX1164-DPP-NEXT:    s_nop 0
-; GFX1164-DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-DPP-NEXT:    s_endpgm
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe:
@@ -2340,8 +2318,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    global_atomic_min_f32 v0, v1, s[0:1]
 ; GFX1132-DPP-NEXT:  .LBB4_2:
-; GFX1132-DPP-NEXT:    s_nop 0
-; GFX1132-DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-DPP-NEXT:    s_endpgm
   %result = atomicrmw fmin ptr addrspace(1) %ptr, float 4.0 monotonic, align 4, !amdgpu.no.fine.grained.memory !1
   ret void
@@ -8957,8 +8933,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    global_atomic_min_f32 v0, v1, s[0:1]
 ; GFX1164-NEXT:  .LBB12_2:
-; GFX1164-NEXT:    s_nop 0
-; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -8974,8 +8948,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    global_atomic_min_f32 v0, v1, s[0:1]
 ; GFX1132-NEXT:  .LBB12_2:
-; GFX1132-NEXT:    s_nop 0
-; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 ;
 ; GFX7LESS-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -9086,8 +9058,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    global_atomic_min_f32 v0, v1, s[0:1]
 ; GFX1164-DPP-NEXT:  .LBB12_2:
-; GFX1164-DPP-NEXT:    s_nop 0
-; GFX1164-DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-DPP-NEXT:    s_endpgm
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -9103,8 +9073,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    global_atomic_min_f32 v0, v1, s[0:1]
 ; GFX1132-DPP-NEXT:  .LBB12_2:
-; GFX1132-DPP-NEXT:    s_nop 0
-; GFX1132-DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-DPP-NEXT:    s_endpgm
   %result = atomicrmw fmin ptr addrspace(1) %ptr, float 4.0  monotonic, align 4, !amdgpu.no.fine.grained.memory !1, !amdgpu.no.remote.memory !1, !amdgpu.ignore.denormal.mode !1
   ret void
@@ -9219,8 +9187,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    global_atomic_min_f32 v0, v1, s[0:1]
 ; GFX1164-NEXT:  .LBB13_2:
-; GFX1164-NEXT:    s_nop 0
-; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -9236,8 +9202,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    global_atomic_min_f32 v0, v1, s[0:1]
 ; GFX1132-NEXT:  .LBB13_2:
-; GFX1132-NEXT:    s_nop 0
-; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 ;
 ; GFX7LESS-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -9348,8 +9312,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-DPP-NEXT:    global_atomic_min_f32 v0, v1, s[0:1]
 ; GFX1164-DPP-NEXT:  .LBB13_2:
-; GFX1164-DPP-NEXT:    s_nop 0
-; GFX1164-DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-DPP-NEXT:    s_endpgm
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -9365,8 +9327,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    global_atomic_min_f32 v0, v1, s[0:1]
 ; GFX1132-DPP-NEXT:  .LBB13_2:
-; GFX1132-DPP-NEXT:    s_nop 0
-; GFX1132-DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-DPP-NEXT:    s_endpgm
   %result = atomicrmw fmin ptr addrspace(1) %ptr, float 4.0  monotonic, align 4, !amdgpu.no.fine.grained.memory !1, !amdgpu.no.remote.memory !1
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll
index 3735c6349fbb31..1109a0a25349f7 100644
--- a/llvm/test/CodeGen/AMDGPU/half.ll
+++ b/llvm/test/CodeGen/AMDGPU/half.ll
@@ -36,8 +36,6 @@ define amdgpu_kernel void @load_f16_arg(ptr addrspace(1) %out, half %arg) #0 {
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   store half %arg, ptr addrspace(1) %out
   ret void
@@ -74,8 +72,6 @@ define amdgpu_kernel void @load_v2f16_arg(ptr addrspace(1) %out, <2 x half> %arg
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   store <2 x half> %arg, ptr addrspace(1) %out
   ret void
@@ -107,8 +103,6 @@ define amdgpu_kernel void @load_v3f16_arg(ptr addrspace(1) %out, <3 x half> %arg
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1] offset:4
 ; GFX11-NEXT:    global_store_b32 v0, v2, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   store <3 x half> %arg, ptr addrspace(1) %out
   ret void
@@ -135,8 +129,6 @@ define amdgpu_kernel void @load_v4f16_arg(ptr addrspace(1) %out, <4 x half> %arg
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   store <4 x half> %arg, ptr addrspace(1) %out
   ret void
@@ -181,8 +173,6 @@ define amdgpu_kernel void @load_v8f16_arg(ptr addrspace(1) %out, <8 x half> %arg
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7
 ; GFX11-NEXT:    v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6
 ; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   store <8 x half> %arg, ptr addrspace(1) %out
   ret void
@@ -226,8 +216,6 @@ define amdgpu_kernel void @extload_v2f16_arg(ptr addrspace(1) %out, <2 x half> %
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, s4
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, s2
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %fpext = fpext <2 x half> %in to <2 x float>
   store <2 x float> %fpext, ptr addrspace(1) %out
@@ -266,8 +254,6 @@ define amdgpu_kernel void @extload_f16_to_f32_arg(ptr addrspace(1) %out, half %a
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, s4
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %ext = fpext half %arg to float
   store float %ext, ptr addrspace(1) %out
@@ -312,8 +298,6 @@ define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(ptr addrspace(1) %out, <2
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, s4
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, s2
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %ext = fpext <2 x half> %arg to <2 x float>
   store <2 x float> %ext, ptr addrspace(1) %out
@@ -357,8 +341,6 @@ define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(ptr addrspace(1) %out, <3
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, s4
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v2, s3
 ; GFX11-NEXT:    global_store_b96 v3, v[0:2], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %ext = fpext <3 x half> %arg to <3 x float>
   store <3 x float> %ext, ptr addrspace(1) %out
@@ -408,8 +390,6 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, s5
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v2, s3
 ; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %ext = fpext <4 x half> %arg to <4 x float>
   store <4 x float> %ext, ptr addrspace(1) %out
@@ -495,8 +475,6 @@ define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(ptr addrspace(1) %out, <8
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b128 v8, v[4:7], s[0:1] offset:16
 ; GFX11-NEXT:    global_store_b128 v8, v[0:3], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %ext = fpext <8 x half> %arg to <8 x float>
   store <8 x float> %ext, ptr addrspace(1) %out
@@ -541,8 +519,6 @@ define amdgpu_kernel void @extload_f16_to_f64_arg(ptr addrspace(1) %out, half %a
 ; GFX11-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %ext = fpext half %arg to double
   store double %ext, ptr addrspace(1) %out
@@ -596,8 +572,6 @@ define amdgpu_kernel void @extload_v2f16_to_v2f64_arg(ptr addrspace(1) %out, <2
 ; GFX11-NEXT:    v_cvt_f64_f32_e32 v[0:1], v1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %ext = fpext <2 x half> %arg to <2 x double>
   store <2 x double> %ext, ptr addrspace(1) %out
@@ -664,8 +638,6 @@ define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(ptr addrspace(1) %out, <3
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b64 v6, v[4:5], s[0:1] offset:16
 ; GFX11-NEXT:    global_store_b128 v6, v[0:3], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %ext = fpext <3 x half> %arg to <3 x double>
   store <3 x double> %ext, ptr addrspace(1) %out
@@ -743,8 +715,6 @@ define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(ptr addrspace(1) %out, <4
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b128 v8, v[4:7], s[0:1] offset:16
 ; GFX11-NEXT:    global_store_b128 v8, v[0:3], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %ext = fpext <4 x half> %arg to <4 x double>
   store <4 x double> %ext, ptr addrspace(1) %out
@@ -878,8 +848,6 @@ define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(ptr addrspace(1) %out, <8
 ; GFX11-NEXT:    global_store_b128 v16, v[8:11], s[0:1] offset:32
 ; GFX11-NEXT:    global_store_b128 v16, v[4:7], s[0:1] offset:16
 ; GFX11-NEXT:    global_store_b128 v16, v[0:3], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %ext = fpext <8 x half> %arg to <8 x double>
   store <8 x double> %ext, ptr addrspace(1) %out
@@ -908,8 +876,6 @@ define amdgpu_kernel void @global_load_store_f16(ptr addrspace(1) %out, ptr addr
 ; GFX11-NEXT:    global_load_u16 v1, v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %val = load half, ptr addrspace(1) %in
   store half %val, ptr addrspace(1) %out
@@ -938,8 +904,6 @@ define amdgpu_kernel void @global_load_store_v2f16(ptr addrspace(1) %out, ptr ad
 ; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %val = load <2 x half>, ptr addrspace(1) %in
   store <2 x half> %val, ptr addrspace(1) %out
@@ -968,8 +932,6 @@ define amdgpu_kernel void @global_load_store_v4f16(ptr addrspace(1) %in, ptr add
 ; GFX11-NEXT:    global_load_b64 v[0:1], v2, s[0:1]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %val = load <4 x half>, ptr addrspace(1) %in
   store <4 x half> %val, ptr addrspace(1) %out
@@ -998,8 +960,6 @@ define amdgpu_kernel void @global_load_store_v8f16(ptr addrspace(1) %out, ptr ad
 ; GFX11-NEXT:    global_load_b128 v[0:3], v4, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %val = load <8 x half>, ptr addrspace(1) %in
   store <8 x half> %val, ptr addrspace(1) %out
@@ -1030,8 +990,6 @@ define amdgpu_kernel void @global_extload_f16_to_f32(ptr addrspace(1) %out, ptr
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %val = load half, ptr addrspace(1) %in
   %cvt = fpext half %val to float
@@ -1083,8 +1041,6 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f32(ptr addrspace(1) %out,
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %val = load <2 x half>, ptr addrspace(1) %in
   %cvt = fpext <2 x half> %val to <2 x float>
@@ -1139,8 +1095,6 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f32(ptr addrspace(1) %out,
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, v4
 ; GFX11-NEXT:    global_store_b96 v3, v[0:2], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %val = load <3 x half>, ptr addrspace(1) %in
   %cvt = fpext <3 x half> %val to <3 x float>
@@ -1200,8 +1154,6 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f32(ptr addrspace(1) %out,
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, v5
 ; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %val = load <4 x half>, ptr addrspace(1) %in
   %cvt = fpext <4 x half> %val to <4 x float>
@@ -1288,8 +1240,6 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f32(ptr addrspace(1) %out,
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b128 v12, v[8:11], s[0:1] offset:16
 ; GFX11-NEXT:    global_store_b128 v12, v[4:7], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %val = load <8 x half>, ptr addrspace(1) %in
   %cvt = fpext <8 x half> %val to <8 x float>
@@ -1445,8 +1395,6 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out
 ; GFX11-NEXT:    global_store_b128 v20, v[0:3], s[0:1] offset:32
 ; GFX11-NEXT:    global_store_b128 v20, v[12:15], s[0:1] offset:16
 ; GFX11-NEXT:    global_store_b128 v20, v[8:11], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %val = load <16 x half>, ptr addrspace(1) %in
   %cvt = fpext <16 x half> %val to <16 x float>
@@ -1481,8 +1429,6 @@ define amdgpu_kernel void @global_extload_f16_to_f64(ptr addrspace(1) %out, ptr
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %val = load half, ptr addrspace(1) %in
   %cvt = fpext half %val to double
@@ -1541,8 +1487,6 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f64(ptr addrspace(1) %out,
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
 ; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %val = load <2 x half>, ptr addrspace(1) %in
   %cvt = fpext <2 x half> %val to <2 x double>
@@ -1619,8 +1563,6 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f64(ptr addrspace(1) %out,
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b64 v6, v[4:5], s[0:1] offset:16
 ; GFX11-NEXT:    global_store_b128 v6, v[0:3], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %val = load <3 x half>, ptr addrspace(1) %in
   %cvt = fpext <3 x half> %val to <3 x double>
@@ -1706,8 +1648,6 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f64(ptr addrspace(1) %out,
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b128 v8, v[4:7], s[0:1] offset:16
 ; GFX11-NEXT:    global_store_b128 v8, v[0:3], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %val = load <4 x half>, ptr addrspace(1) %in
   %cvt = fpext <4 x half> %val to <4 x double>
@@ -1840,8 +1780,6 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f64(ptr addrspace(1) %out,
 ; GFX11-NEXT:    global_store_b128 v16, v[8:11], s[0:1] offset:32
 ; GFX11-NEXT:    global_store_b128 v16, v[4:7], s[0:1] offset:16
 ; GFX11-NEXT:    global_store_b128 v16, v[0:3], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %val = load <8 x half>, ptr addrspace(1) %in
   %cvt = fpext <8 x half> %val to <8 x double>
@@ -2090,8 +2028,6 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out
 ; GFX11-NEXT:    global_store_b128 v32, v[8:11], s[0:1] offset:32
 ; GFX11-NEXT:    global_store_b128 v32, v[4:7], s[0:1] offset:16
 ; GFX11-NEXT:    global_store_b128 v32, v[0:3], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %val = load <16 x half>, ptr addrspace(1) %in
   %cvt = fpext <16 x half> %val to <16 x double>
@@ -2123,8 +2059,6 @@ define amdgpu_kernel void @global_truncstore_f32_to_f16(ptr addrspace(1) %out, p
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %val = load float, ptr addrspace(1) %in
   %cvt = fptrunc float %val to half
@@ -2178,8 +2112,6 @@ define amdgpu_kernel void @global_truncstore_v2f32_to_v2f16(ptr addrspace(1) %ou
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX11-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %val = load <2 x float>, ptr addrspace(1) %in
   %cvt = fptrunc <2 x float> %val to <2 x half>
@@ -2248,8 +2180,6 @@ define amdgpu_kernel void @global_truncstore_v3f32_to_v3f16(ptr addrspace(1) %ou
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b16 v3, v2, s[0:1] offset:4
 ; GFX11-NEXT:    global_store_b32 v3, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %val = load <3 x float>, ptr addrspace(1) %in
   %cvt = fptrunc <3 x float> %val to <3 x half>
@@ -2313,8 +2243,6 @@ define amdgpu_kernel void @global_truncstore_v4f32_to_v4f16(ptr addrspace(1) %ou
 ; GFX11-NEXT:    v_pack_b32_f16 v1, v2, v3
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v5
 ; GFX11-NEXT:    global_store_b64 v4, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %val = load <4 x float>, ptr addrspace(1) %in
   %cvt = fptrunc <4 x float> %val to <4 x half>
@@ -2413,8 +2341,6 @@ define amdgpu_kernel void @global_truncstore_v8f32_to_v8f16(ptr addrspace(1) %ou
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v4, v5
 ; GFX11-NEXT:    global_store_b128 v8, v[0:3], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %val = load <8 x float>, ptr addrspace(1) %in
   %cvt = fptrunc <8 x float> %val to <8 x half>
@@ -2593,8 +2519,6 @@ define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) %
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b128 v16, v[4:7], s[0:1] offset:16
 ; GFX11-NEXT:    global_store_b128 v16, v[0:3], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %val = load <16 x float>, ptr addrspace(1) %in
   %cvt = fptrunc <16 x float> %val to <16 x half>
@@ -2644,8 +2568,6 @@ define amdgpu_kernel void @fadd_f16(ptr addrspace(1) %out, half %a, half %b) #0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_add_f16_e64 v1, s4, s2
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
    %add = fadd half %a, %b
    store half %add, ptr addrspace(1) %out, align 4
@@ -2698,8 +2620,6 @@ define amdgpu_kernel void @fadd_v2f16(ptr addrspace(1) %out, <2 x half> %a, <2 x
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_pk_add_f16 v1, s2, s3
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %add = fadd <2 x half> %a, %b
   store <2 x half> %add, ptr addrspace(1) %out, align 8
@@ -2773,8 +2693,6 @@ define amdgpu_kernel void @fadd_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX11-NEXT:    v_pk_add_f16 v1, v1, v3
 ; GFX11-NEXT:    v_pk_add_f16 v0, v0, v2
 ; GFX11-NEXT:    global_store_b64 v4, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %b_ptr = getelementptr <4 x half>, ptr addrspace(1) %in, i32 1
   %a = load <4 x half>, ptr addrspace(1) %in, align 16
@@ -2897,8 +2815,6 @@ define amdgpu_kernel void @fadd_v8f16(ptr addrspace(1) %out, <8 x half> %a, <8 x
 ; GFX11-NEXT:    v_pk_add_f16 v1, s5, s9
 ; GFX11-NEXT:    v_pk_add_f16 v0, s4, s8
 ; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %add = fadd <8 x half> %a, %b
   store <8 x half> %add, ptr addrspace(1) %out, align 32
@@ -2927,8 +2843,6 @@ define amdgpu_kernel void @test_bitcast_from_half(ptr addrspace(1) %in, ptr addr
 ; GFX11-NEXT:    global_load_u16 v1, v0, s[0:1]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %val = load half, ptr addrspace(1) %in
   %val_int = bitcast half %val to i16
@@ -2958,8 +2872,6 @@ define amdgpu_kernel void @test_bitcast_to_half(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    global_load_u16 v1, v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %val = load i16, ptr addrspace(1) %in
   %val_fp = bitcast i16 %val to half
diff --git a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
index 5eb4d9b7a2beb7..35b6bfbee111fb 100644
--- a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
+++ b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
@@ -149,8 +149,6 @@ define amdgpu_kernel void @udiv32_invariant_denom(ptr addrspace(1) nocapture %ar
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[10:11]
 ; GFX11-NEXT:    s_cbranch_scc0 .LBB0_1
 ; GFX11-NEXT:  ; %bb.2: ; %bb2
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 bb:
   br label %bb3
@@ -310,8 +308,6 @@ define amdgpu_kernel void @urem32_invariant_denom(ptr addrspace(1) nocapture %ar
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[10:11]
 ; GFX11-NEXT:    s_cbranch_scc0 .LBB1_1
 ; GFX11-NEXT:  ; %bb.2: ; %bb2
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 bb:
   br label %bb3
@@ -469,8 +465,6 @@ define amdgpu_kernel void @sdiv32_invariant_denom(ptr addrspace(1) nocapture %ar
 ; GFX11-NEXT:    s_cmpk_eq_i32 s3, 0x400
 ; GFX11-NEXT:    s_cbranch_scc0 .LBB2_1
 ; GFX11-NEXT:  ; %bb.2: ; %bb2
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 bb:
   br label %bb3
@@ -614,8 +608,6 @@ define amdgpu_kernel void @srem32_invariant_denom(ptr addrspace(1) nocapture %ar
 ; GFX11-NEXT:    s_cmpk_eq_i32 s2, 0x400
 ; GFX11-NEXT:    s_cbranch_scc0 .LBB3_1
 ; GFX11-NEXT:  ; %bb.2: ; %bb2
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 bb:
   br label %bb3
@@ -729,8 +721,6 @@ define amdgpu_kernel void @udiv16_invariant_denom(ptr addrspace(1) nocapture %ar
 ; GFX11-NEXT:    global_store_b16 v4, v2, s[0:1]
 ; GFX11-NEXT:    s_cbranch_scc0 .LBB4_1
 ; GFX11-NEXT:  ; %bb.2: ; %bb2
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 bb:
   br label %bb3
@@ -851,8 +841,6 @@ define amdgpu_kernel void @urem16_invariant_denom(ptr addrspace(1) nocapture %ar
 ; GFX11-NEXT:    global_store_b16 v3, v2, s[0:1]
 ; GFX11-NEXT:    s_cbranch_scc0 .LBB5_1
 ; GFX11-NEXT:  ; %bb.2: ; %bb2
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 bb:
   br label %bb3
@@ -986,8 +974,6 @@ define amdgpu_kernel void @sdiv16_invariant_denom(ptr addrspace(1) nocapture %ar
 ; GFX11-NEXT:    global_store_b16 v3, v2, s[0:1]
 ; GFX11-NEXT:    s_cbranch_scc0 .LBB6_1
 ; GFX11-NEXT:  ; %bb.2: ; %bb2
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 bb:
   br label %bb3
@@ -1129,8 +1115,6 @@ define amdgpu_kernel void @srem16_invariant_denom(ptr addrspace(1) nocapture %ar
 ; GFX11-NEXT:    global_store_b16 v3, v2, s[0:1]
 ; GFX11-NEXT:    s_cbranch_scc0 .LBB7_1
 ; GFX11-NEXT:  ; %bb.2: ; %bb2
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 bb:
   br label %bb3
diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll
index fee59455da4c86..108d85e024ad76 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll
@@ -149,8 +149,6 @@ define amdgpu_kernel void @idot4_acc32(ptr addrspace(1) %src1,
 ; GFX11-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-DL-NEXT:    v_dot4_i32_iu8 v0, v1, v0, s2 neg_lo:[1,1,0]
 ; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-DL-NEXT:    s_nop 0
-; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DL-NEXT:    s_endpgm
                                        ptr addrspace(1) %src2,
                                        ptr addrspace(1) nocapture %dst) {
@@ -360,8 +358,6 @@ define amdgpu_kernel void @idot4_acc16(ptr addrspace(1) %src1,
 ; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-DL-NEXT:    v_dot4_i32_iu8 v0, v2, v0, v3 neg_lo:[1,1,0]
 ; GFX11-DL-NEXT:    global_store_b16 v1, v0, s[0:1]
-; GFX11-DL-NEXT:    s_nop 0
-; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DL-NEXT:    s_endpgm
                                        ptr addrspace(1) %src2,
                                        ptr addrspace(1) nocapture %dst) {
@@ -547,8 +543,6 @@ define amdgpu_kernel void @idot4_acc8(ptr addrspace(1) %src1,
 ; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v2, v0, v3
 ; GFX11-DL-NEXT:    global_store_b8 v1, v0, s[0:1]
-; GFX11-DL-NEXT:    s_nop 0
-; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DL-NEXT:    s_endpgm
                                       ptr addrspace(1) %src2,
                                       ptr addrspace(1) nocapture %dst) {
@@ -746,8 +740,6 @@ define amdgpu_kernel void @idot4_multiuse_mul1(ptr addrspace(1) %src1,
 ; GFX11-DL-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX11-DL-NEXT:    v_dot4_i32_iu8 v0, v1, v0, v2 neg_lo:[1,1,0]
 ; GFX11-DL-NEXT:    global_store_b32 v3, v0, s[0:1]
-; GFX11-DL-NEXT:    s_nop 0
-; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DL-NEXT:    s_endpgm
                                                ptr addrspace(1) %src2,
                                                ptr addrspace(1) nocapture %dst) {
@@ -940,8 +932,6 @@ define amdgpu_kernel void @idot4_acc32_vecMul(ptr addrspace(1) %src1,
 ; GFX11-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-DL-NEXT:    v_dot4_i32_iu8 v0, v1, v0, s2 neg_lo:[1,1,0]
 ; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-DL-NEXT:    s_nop 0
-; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DL-NEXT:    s_endpgm
                                               ptr addrspace(1) %src2,
                                               ptr addrspace(1) nocapture %dst) {
@@ -1211,8 +1201,6 @@ define amdgpu_kernel void @idot4_acc16_vecMul(ptr addrspace(1) %src1,
 ; GFX11-DL-NEXT:    v_add_nc_u16 v0, v1, v0
 ; GFX11-DL-NEXT:    v_add_nc_u16 v0, v0, v3
 ; GFX11-DL-NEXT:    global_store_b16 v2, v0, s[0:1]
-; GFX11-DL-NEXT:    s_nop 0
-; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DL-NEXT:    s_endpgm
                                               ptr addrspace(1) %src2,
                                               ptr addrspace(1) nocapture %dst) {
@@ -1382,8 +1370,6 @@ define amdgpu_kernel void @idot4_acc32_2ele(ptr addrspace(1) %src1,
 ; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-DL-NEXT:    v_dot4_i32_iu8 v0, v0, v1, s2 neg_lo:[1,1,0]
 ; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-DL-NEXT:    s_nop 0
-; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DL-NEXT:    s_endpgm
                                        ptr addrspace(1) %src2,
                                        ptr addrspace(1) nocapture %dst) {
@@ -1564,8 +1550,6 @@ define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1,
 ; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-DL-NEXT:    v_dot4_i32_iu8 v0, v0, v1, s2 neg_lo:[1,1,0]
 ; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-DL-NEXT:    s_nop 0
-; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DL-NEXT:    s_endpgm
                                        ptr addrspace(1) %src2,
                                        ptr addrspace(1) nocapture %dst) {
@@ -1753,8 +1737,6 @@ define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1,
 ; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-DL-NEXT:    v_dot4_i32_iu8 v0, v0, v1, s2 neg_lo:[1,1,0]
 ; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-DL-NEXT:    s_nop 0
-; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DL-NEXT:    s_endpgm
                                        ptr addrspace(1) %src2,
                                        ptr addrspace(1) nocapture %dst) {
@@ -1923,8 +1905,6 @@ define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1,
 ; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-DL-NEXT:    v_dot4_i32_iu8 v0, v1, v0, 0 neg_lo:[1,1,0]
 ; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-DL-NEXT:    s_nop 0
-; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DL-NEXT:    s_endpgm
                                        ptr addrspace(1) %src2,
                                        ptr addrspace(1) nocapture %dst) {
@@ -2131,8 +2111,6 @@ define amdgpu_kernel void @idot4_acc32_3src(ptr addrspace(1) %src1,
 ; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-DL-NEXT:    v_dot4_i32_iu8 v0, v0, v1, s0 neg_lo:[1,1,0]
 ; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[6:7]
-; GFX11-DL-NEXT:    s_nop 0
-; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DL-NEXT:    s_endpgm
                                        ptr addrspace(1) %src2,
                                        ptr addrspace(1) %src3,
@@ -2339,8 +2317,6 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele(ptr addrspace(1) %src1,
 ; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-DL-NEXT:    v_dot4_i32_iu8 v0, v0, v1, s0 neg_lo:[1,1,0]
 ; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[6:7]
-; GFX11-DL-NEXT:    s_nop 0
-; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DL-NEXT:    s_endpgm
                                        ptr addrspace(1) %src2,
                                        ptr addrspace(1) %src3,
@@ -2546,8 +2522,6 @@ define amdgpu_kernel void @idot4_bad_source(ptr addrspace(1) %src1,
 ; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-DL-NEXT:    v_dot4_i32_iu8 v0, v1, v0, v2 neg_lo:[1,1,0]
 ; GFX11-DL-NEXT:    global_store_b32 v3, v0, s[0:1]
-; GFX11-DL-NEXT:    s_nop 0
-; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DL-NEXT:    s_endpgm
                                        ptr addrspace(1) %src2,
                                        ptr addrspace(1) %src3,
@@ -2741,8 +2715,6 @@ define amdgpu_kernel void @idot4_commutative(ptr addrspace(1) %src1,
 ; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-DL-NEXT:    v_dot4_i32_iu8 v0, v0, v1, s2 neg_lo:[1,1,0]
 ; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-DL-NEXT:    s_nop 0
-; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DL-NEXT:    s_endpgm
                                        ptr addrspace(1) %src2,
                                        ptr addrspace(1) %src3,
@@ -2944,8 +2916,6 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1,
 ; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-DL-NEXT:    v_dot4_i32_iu8 v0, v0, v1, s0 neg_lo:[1,1,0]
 ; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[6:7]
-; GFX11-DL-NEXT:    s_nop 0
-; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DL-NEXT:    s_endpgm
                                        ptr addrspace(1) %src2,
                                        ptr addrspace(1) %src3,
@@ -3182,8 +3152,6 @@ define amdgpu_kernel void @idot4_4src(ptr addrspace(1) %src1,
 ; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-DL-NEXT:    v_dot4_i32_iu8 v0, v0, v2, s2 neg_lo:[1,1,0]
 ; GFX11-DL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-DL-NEXT:    s_nop 0
-; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DL-NEXT:    s_endpgm
                                        ptr addrspace(1) %src2,
                                        ptr addrspace(1) %src3,
@@ -3442,8 +3410,6 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1,
 ; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-DL-NEXT:    v_bfe_i32 v0, v0, 0, 16
 ; GFX11-DL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-DL-NEXT:    s_nop 0
-; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DL-NEXT:    s_endpgm
                                        ptr addrspace(1) %src2,
                                        ptr addrspace(1) nocapture %dst) {
diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll
index 2894ae76c0be4a..e146cea50fa451 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll
@@ -147,8 +147,6 @@ define amdgpu_kernel void @udot4_acc32(ptr addrspace(1) %src1,
 ; GFX11-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v1, v0, s2
 ; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-DL-NEXT:    s_nop 0
-; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DL-NEXT:    s_endpgm
                                        ptr addrspace(1) %src2,
                                        ptr addrspace(1) nocapture %dst) {
@@ -345,8 +343,6 @@ define amdgpu_kernel void @udot4_acc16(ptr addrspace(1) %src1,
 ; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v2, v0, v3
 ; GFX11-DL-NEXT:    global_store_b16 v1, v0, s[0:1]
-; GFX11-DL-NEXT:    s_nop 0
-; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DL-NEXT:    s_endpgm
                                        ptr addrspace(1) %src2,
                                        ptr addrspace(1) nocapture %dst) {
@@ -533,8 +529,6 @@ define amdgpu_kernel void @udot4_acc8(ptr addrspace(1) %src1,
 ; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v2, v0, v3
 ; GFX11-DL-NEXT:    global_store_b8 v1, v0, s[0:1]
-; GFX11-DL-NEXT:    s_nop 0
-; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DL-NEXT:    s_endpgm
                                       ptr addrspace(1) %src2,
                                       ptr addrspace(1) nocapture %dst) {
@@ -710,8 +704,6 @@ define amdgpu_kernel void @udot2_8(ptr addrspace(1) %src1,
 ; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v1, v0, v3
 ; GFX11-DL-NEXT:    global_store_b8 v2, v0, s[0:1]
-; GFX11-DL-NEXT:    s_nop 0
-; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DL-NEXT:    s_endpgm
                                    ptr addrspace(1) %src2,
                                    ptr addrspace(1) nocapture %dst) {
@@ -879,8 +871,6 @@ define amdgpu_kernel void @udot4_CommutationInsideMAD(ptr addrspace(1) %src1,
 ; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v0, v2, v3
 ; GFX11-DL-NEXT:    global_store_b8 v1, v0, s[0:1]
-; GFX11-DL-NEXT:    s_nop 0
-; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DL-NEXT:    s_endpgm
                                                       ptr addrspace(1) %src2,
                                                       ptr addrspace(1) nocapture %dst) {
@@ -1059,8 +1049,6 @@ define amdgpu_kernel void @udot4_CommutationAccrossMADs(ptr addrspace(1) %src1,
 ; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v0, v2, v3
 ; GFX11-DL-NEXT:    global_store_b8 v1, v0, s[0:1]
-; GFX11-DL-NEXT:    s_nop 0
-; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DL-NEXT:    s_endpgm
                                                         ptr addrspace(1) %src2,
                                                         ptr addrspace(1) nocapture %dst) {
@@ -1259,8 +1247,6 @@ define amdgpu_kernel void @udot4_multiuse_mul1(ptr addrspace(1) %src1,
 ; GFX11-DL-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v1, v0, v2
 ; GFX11-DL-NEXT:    global_store_b32 v3, v0, s[0:1]
-; GFX11-DL-NEXT:    s_nop 0
-; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DL-NEXT:    s_endpgm
                                                ptr addrspace(1) %src2,
                                                ptr addrspace(1) nocapture %dst) {
@@ -1470,8 +1456,6 @@ define amdgpu_kernel void @udot4_multiuse_add1(ptr addrspace(1) %src1,
 ; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-DL-NEXT:    v_add3_u32 v0, s2, v2, v0
 ; GFX11-DL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-DL-NEXT:    s_nop 0
-; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DL-NEXT:    s_endpgm
                                                ptr addrspace(1) %src2,
                                                ptr addrspace(1) nocapture %dst) {
@@ -1713,8 +1697,6 @@ define amdgpu_kernel void @notdot4_mixedtypes(ptr addrspace(1) %src1,
 ; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v1, v0, v3
 ; GFX11-DL-NEXT:    global_store_b16 v2, v0, s[0:1]
-; GFX11-DL-NEXT:    s_nop 0
-; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DL-NEXT:    s_endpgm
                                               ptr addrspace(1) %src2,
                                               ptr addrspace(1) nocapture %dst) {
@@ -1977,8 +1959,6 @@ define amdgpu_kernel void @notdot4_mixedtypes2(ptr addrspace(1) %src1,
 ; GFX11-DL-NEXT:    v_mad_u16 v3, v4, v5, v3
 ; GFX11-DL-NEXT:    v_mad_u16 v0, v1, v0, v3
 ; GFX11-DL-NEXT:    global_store_b16 v2, v0, s[0:1]
-; GFX11-DL-NEXT:    s_nop 0
-; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DL-NEXT:    s_endpgm
                                               ptr addrspace(1) %src2,
                                               ptr addrspace(1) nocapture %dst) {
@@ -2164,8 +2144,6 @@ define amdgpu_kernel void @udot4_acc32_vecMul(ptr addrspace(1) %src1,
 ; GFX11-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v1, v0, s2
 ; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-DL-NEXT:    s_nop 0
-; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DL-NEXT:    s_endpgm
                                               ptr addrspace(1) %src2,
                                               ptr addrspace(1) nocapture %dst) {
@@ -2426,8 +2404,6 @@ define amdgpu_kernel void @udot4_acc16_vecMul(ptr addrspace(1) %src1,
 ; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-DL-NEXT:    v_add_nc_u16 v0, v0, v3
 ; GFX11-DL-NEXT:    global_store_b16 v2, v0, s[0:1]
-; GFX11-DL-NEXT:    s_nop 0
-; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DL-NEXT:    s_endpgm
                                               ptr addrspace(1) %src2,
                                               ptr addrspace(1) nocapture %dst) {
@@ -2669,8 +2645,6 @@ define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1,
 ; GFX11-DL-NEXT:    v_mad_u16 v0, v4, v7, v0
 ; GFX11-DL-NEXT:    v_add_nc_u16 v0, v0, v1
 ; GFX11-DL-NEXT:    global_store_b8 v2, v0, s[0:1]
-; GFX11-DL-NEXT:    s_nop 0
-; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DL-NEXT:    s_endpgm
                                              ptr addrspace(1) %src2,
                                              ptr addrspace(1) nocapture %dst) {
@@ -2836,8 +2810,6 @@ define amdgpu_kernel void @idot4_acc32_2ele(ptr addrspace(1) %src1,
 ; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v0, v1, s2
 ; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-DL-NEXT:    s_nop 0
-; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DL-NEXT:    s_endpgm
                                        ptr addrspace(1) %src2,
                                        ptr addrspace(1) nocapture %dst) {
@@ -3016,8 +2988,6 @@ define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1,
 ; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v0, v1, s2
 ; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-DL-NEXT:    s_nop 0
-; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DL-NEXT:    s_endpgm
                                        ptr addrspace(1) %src2,
                                        ptr addrspace(1) nocapture %dst) {
@@ -3203,8 +3173,6 @@ define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1,
 ; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v0, v1, s2
 ; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-DL-NEXT:    s_nop 0
-; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DL-NEXT:    s_endpgm
                                        ptr addrspace(1) %src2,
                                        ptr addrspace(1) nocapture %dst) {
@@ -3373,8 +3341,6 @@ define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1,
 ; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v1, v0, 0
 ; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-DL-NEXT:    s_nop 0
-; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DL-NEXT:    s_endpgm
                                        ptr addrspace(1) %src2,
                                        ptr addrspace(1) nocapture %dst) {
@@ -3580,8 +3546,6 @@ define amdgpu_kernel void @udot4_acc32_3src(ptr addrspace(1) %src1,
 ; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v0, v1, s0
 ; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[6:7]
-; GFX11-DL-NEXT:    s_nop 0
-; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DL-NEXT:    s_endpgm
                                        ptr addrspace(1) %src2,
                                        ptr addrspace(1) %src3,
@@ -3788,8 +3752,6 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele(ptr addrspace(1) %src1,
 ; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v0, v1, s0
 ; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[6:7]
-; GFX11-DL-NEXT:    s_nop 0
-; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DL-NEXT:    s_endpgm
                                        ptr addrspace(1) %src2,
                                        ptr addrspace(1) %src3,
@@ -3997,8 +3959,6 @@ define amdgpu_kernel void @udot4_bad_source(ptr addrspace(1) %src1,
 ; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v1, v0, v2
 ; GFX11-DL-NEXT:    global_store_b32 v3, v0, s[0:1]
-; GFX11-DL-NEXT:    s_nop 0
-; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DL-NEXT:    s_endpgm
                                        ptr addrspace(1) %src2,
                                        ptr addrspace(1) %src3,
@@ -4191,8 +4151,6 @@ define amdgpu_kernel void @udot4_commutative(ptr addrspace(1) %src1,
 ; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v0, v1, s2
 ; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-DL-NEXT:    s_nop 0
-; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DL-NEXT:    s_endpgm
                                        ptr addrspace(1) %src2,
                                        ptr addrspace(1) %src3,
@@ -4393,8 +4351,6 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1,
 ; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v0, v1, s0
 ; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[6:7]
-; GFX11-DL-NEXT:    s_nop 0
-; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DL-NEXT:    s_endpgm
                                        ptr addrspace(1) %src2,
                                        ptr addrspace(1) %src3,
@@ -4630,8 +4586,6 @@ define amdgpu_kernel void @udot4_4src(ptr addrspace(1) %src1,
 ; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v0, v2, s2
 ; GFX11-DL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-DL-NEXT:    s_nop 0
-; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DL-NEXT:    s_endpgm
                                        ptr addrspace(1) %src2,
                                        ptr addrspace(1) %src3,
@@ -4874,8 +4828,6 @@ define amdgpu_kernel void @udot4_acc32_multi(ptr addrspace(1) %src1,
 ; GFX11-DL-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v0, v2, v1
 ; GFX11-DL-NEXT:    global_store_b32 v3, v0, s[0:1]
-; GFX11-DL-NEXT:    s_nop 0
-; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DL-NEXT:    s_endpgm
                                        ptr addrspace(1) %src2,
                                        ptr addrspace(1) nocapture %dst) {
@@ -5081,8 +5033,6 @@ define amdgpu_kernel void @idot4_acc32_hilo(ptr addrspace(1) %src1,
 ; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v1, v0, 0
 ; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-DL-NEXT:    s_nop 0
-; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DL-NEXT:    s_endpgm
                                        ptr addrspace(1) %src2,
                                        ptr addrspace(1) nocapture %dst) {
@@ -5270,8 +5220,6 @@ define amdgpu_kernel void @idot4_acc32_lohi(ptr addrspace(1) %src1,
 ; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v0, v1, 0
 ; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-DL-NEXT:    s_nop 0
-; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DL-NEXT:    s_endpgm
                                        ptr addrspace(1) %src2,
                                        ptr addrspace(1) nocapture %dst) {
@@ -5461,8 +5409,6 @@ define amdgpu_kernel void @idot4_acc32_hihi(ptr addrspace(1) %src1,
 ; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v0, v1, 0
 ; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-DL-NEXT:    s_nop 0
-; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DL-NEXT:    s_endpgm
                                        ptr addrspace(1) %src2,
                                        ptr addrspace(1) nocapture %dst) {
@@ -5619,8 +5565,6 @@ define amdgpu_kernel void @idot4_acc32_v8i8(ptr addrspace(1) %src1,
 ; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v0, v1, 0
 ; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[2:3]
-; GFX11-DL-NEXT:    s_nop 0
-; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DL-NEXT:    s_endpgm
                                        ptr addrspace(1) %src2,
                                        ptr addrspace(1) nocapture %dst) {
@@ -5820,8 +5764,6 @@ define amdgpu_kernel void @idot4_acc32_v16i8(ptr addrspace(1) %src1,
 ; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v1, v0, 0
 ; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-DL-NEXT:    s_nop 0
-; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DL-NEXT:    s_endpgm
                                        ptr addrspace(1) %src2,
                                        ptr addrspace(1) nocapture %dst) {
@@ -6014,8 +5956,6 @@ define amdgpu_kernel void @idot4_acc32_v256i8(ptr addrspace(1) %src1,
 ; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v0, v1, 0
 ; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-DL-NEXT:    s_nop 0
-; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DL-NEXT:    s_endpgm
                                        ptr addrspace(1) %src2,
                                        ptr addrspace(1) nocapture %dst) {
@@ -6194,8 +6134,6 @@ define amdgpu_kernel void @idot4_acc32_anyext(ptr addrspace(1) %src1,
 ; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v1, v0, s2
 ; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-DL-NEXT:    s_nop 0
-; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DL-NEXT:    s_endpgm
                                        ptr addrspace(1) %src2,
                                        ptr addrspace(1) nocapture %dst) {
diff --git a/llvm/test/CodeGen/AMDGPU/image-load-d16-tfe.ll b/llvm/test/CodeGen/AMDGPU/image-load-d16-tfe.ll
index 83b650e2d755e2..2824d5daea66de 100644
--- a/llvm/test/CodeGen/AMDGPU/image-load-d16-tfe.ll
+++ b/llvm/test/CodeGen/AMDGPU/image-load-d16-tfe.ll
@@ -63,8 +63,6 @@ define amdgpu_ps void @load_1d_f16_tfe_dmask0(<8 x i32> inreg %rsrc, i32 %s) {
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    global_store_b32 v[0:1], v2, off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX8-UNPACKED-LABEL: load_1d_f16_tfe_dmask0:
@@ -153,8 +151,6 @@ define amdgpu_ps void @load_1d_f16_tfe_dmask1(<8 x i32> inreg %rsrc, i32 %s) {
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    global_store_b32 v[0:1], v2, off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX8-UNPACKED-LABEL: load_1d_f16_tfe_dmask1:
@@ -243,8 +239,6 @@ define amdgpu_ps void @load_1d_v2f16_tfe_dmask0(<8 x i32> inreg %rsrc, i32 %s) {
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    global_store_b32 v[0:1], v2, off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX8-UNPACKED-LABEL: load_1d_v2f16_tfe_dmask0:
@@ -333,8 +327,6 @@ define amdgpu_ps void @load_1d_v2f16_tfe_dmask1(<8 x i32> inreg %rsrc, i32 %s) {
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    global_store_b32 v[0:1], v2, off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX8-UNPACKED-LABEL: load_1d_v2f16_tfe_dmask1:
@@ -423,8 +415,6 @@ define amdgpu_ps void @load_1d_v2f16_tfe_dmask3(<8 x i32> inreg %rsrc, i32 %s) {
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    global_store_b32 v[0:1], v2, off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX8-UNPACKED-LABEL: load_1d_v2f16_tfe_dmask3:
@@ -525,8 +515,6 @@ define amdgpu_ps void @load_1d_v3f16_tfe_dmask7(<8 x i32> inreg %rsrc, i32 %s) {
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    global_store_b32 v[0:1], v3, off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX8-UNPACKED-LABEL: load_1d_v3f16_tfe_dmask7:
@@ -624,8 +612,6 @@ define amdgpu_ps void @load_1d_v4f16_tfe_dmask15(<8 x i32> inreg %rsrc, i32 %s)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    global_store_b32 v[0:1], v3, off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX8-UNPACKED-LABEL: load_1d_v4f16_tfe_dmask15:
diff --git a/llvm/test/CodeGen/AMDGPU/imm16.ll b/llvm/test/CodeGen/AMDGPU/imm16.ll
index ecece35337a7a8..02a9169c0e6fae 100644
--- a/llvm/test/CodeGen/AMDGPU/imm16.ll
+++ b/llvm/test/CodeGen/AMDGPU/imm16.ll
@@ -27,8 +27,6 @@ define amdgpu_kernel void @store_inline_imm_neg_0.0_i16(ptr addrspace(1) %out) {
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0 dlc ; encoding: [0x00,0x20,0x64,0xe0,0x00,0x00,0x00,0x80]
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc]
-; GFX11-NEXT:    s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
 ; GFX11-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 ;
 ; VI-LABEL: store_inline_imm_neg_0.0_i16:
@@ -75,8 +73,6 @@ define amdgpu_kernel void @store_inline_imm_0.0_f16(ptr addrspace(1) %out) {
 ; GFX11-NEXT:    s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
-; GFX11-NEXT:    s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
 ; GFX11-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 ;
 ; VI-LABEL: store_inline_imm_0.0_f16:
@@ -121,8 +117,6 @@ define amdgpu_kernel void @store_imm_neg_0.0_f16(ptr addrspace(1) %out) {
 ; GFX11-NEXT:    s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
-; GFX11-NEXT:    s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
 ; GFX11-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 ;
 ; VI-LABEL: store_imm_neg_0.0_f16:
@@ -167,8 +161,6 @@ define amdgpu_kernel void @store_inline_imm_0.5_f16(ptr addrspace(1) %out) {
 ; GFX11-NEXT:    s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
-; GFX11-NEXT:    s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
 ; GFX11-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 ;
 ; VI-LABEL: store_inline_imm_0.5_f16:
@@ -213,8 +205,6 @@ define amdgpu_kernel void @store_inline_imm_m_0.5_f16(ptr addrspace(1) %out) {
 ; GFX11-NEXT:    s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
-; GFX11-NEXT:    s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
 ; GFX11-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 ;
 ; VI-LABEL: store_inline_imm_m_0.5_f16:
@@ -259,8 +249,6 @@ define amdgpu_kernel void @store_inline_imm_1.0_f16(ptr addrspace(1) %out) {
 ; GFX11-NEXT:    s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
-; GFX11-NEXT:    s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
 ; GFX11-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 ;
 ; VI-LABEL: store_inline_imm_1.0_f16:
@@ -305,8 +293,6 @@ define amdgpu_kernel void @store_inline_imm_m_1.0_f16(ptr addrspace(1) %out) {
 ; GFX11-NEXT:    s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
-; GFX11-NEXT:    s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
 ; GFX11-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 ;
 ; VI-LABEL: store_inline_imm_m_1.0_f16:
@@ -351,8 +337,6 @@ define amdgpu_kernel void @store_inline_imm_2.0_f16(ptr addrspace(1) %out) {
 ; GFX11-NEXT:    s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
-; GFX11-NEXT:    s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
 ; GFX11-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 ;
 ; VI-LABEL: store_inline_imm_2.0_f16:
@@ -397,8 +381,6 @@ define amdgpu_kernel void @store_inline_imm_m_2.0_f16(ptr addrspace(1) %out) {
 ; GFX11-NEXT:    s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
-; GFX11-NEXT:    s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
 ; GFX11-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 ;
 ; VI-LABEL: store_inline_imm_m_2.0_f16:
@@ -443,8 +425,6 @@ define amdgpu_kernel void @store_inline_imm_4.0_f16(ptr addrspace(1) %out) {
 ; GFX11-NEXT:    s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
-; GFX11-NEXT:    s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
 ; GFX11-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 ;
 ; VI-LABEL: store_inline_imm_4.0_f16:
@@ -489,8 +469,6 @@ define amdgpu_kernel void @store_inline_imm_m_4.0_f16(ptr addrspace(1) %out) {
 ; GFX11-NEXT:    s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
-; GFX11-NEXT:    s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
 ; GFX11-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 ;
 ; VI-LABEL: store_inline_imm_m_4.0_f16:
@@ -535,8 +513,6 @@ define amdgpu_kernel void @store_inline_imm_inv_2pi_f16(ptr addrspace(1) %out) {
 ; GFX11-NEXT:    s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
-; GFX11-NEXT:    s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
 ; GFX11-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 ;
 ; VI-LABEL: store_inline_imm_inv_2pi_f16:
@@ -581,8 +557,6 @@ define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f16(ptr addrspace(1) %out)
 ; GFX11-NEXT:    s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
-; GFX11-NEXT:    s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
 ; GFX11-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 ;
 ; VI-LABEL: store_inline_imm_m_inv_2pi_f16:
@@ -627,8 +601,6 @@ define amdgpu_kernel void @store_literal_imm_f16(ptr addrspace(1) %out) {
 ; GFX11-NEXT:    s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
-; GFX11-NEXT:    s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
 ; GFX11-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 ;
 ; VI-LABEL: store_literal_imm_f16:
@@ -677,8 +649,6 @@ define amdgpu_kernel void @add_inline_imm_0.0_f16(ptr addrspace(1) %out, half %x
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
 ; GFX11-NEXT:    v_add_f16_e64 v0, s4, 0 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0x00,0x01,0x00]
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
-; GFX11-NEXT:    s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
 ; GFX11-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 ;
 ; VI-LABEL: add_inline_imm_0.0_f16:
@@ -733,8 +703,6 @@ define amdgpu_kernel void @add_inline_imm_0.5_f16(ptr addrspace(1) %out, half %x
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
 ; GFX11-NEXT:    v_add_f16_e64 v0, s4, 0.5 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0xe0,0x01,0x00]
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
-; GFX11-NEXT:    s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
 ; GFX11-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 ;
 ; VI-LABEL: add_inline_imm_0.5_f16:
@@ -789,8 +757,6 @@ define amdgpu_kernel void @add_inline_imm_neg_0.5_f16(ptr addrspace(1) %out, hal
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
 ; GFX11-NEXT:    v_add_f16_e64 v0, s4, -0.5 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0xe2,0x01,0x00]
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
-; GFX11-NEXT:    s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
 ; GFX11-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 ;
 ; VI-LABEL: add_inline_imm_neg_0.5_f16:
@@ -845,8 +811,6 @@ define amdgpu_kernel void @add_inline_imm_1.0_f16(ptr addrspace(1) %out, half %x
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
 ; GFX11-NEXT:    v_add_f16_e64 v0, s4, 1.0 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0xe4,0x01,0x00]
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
-; GFX11-NEXT:    s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
 ; GFX11-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 ;
 ; VI-LABEL: add_inline_imm_1.0_f16:
@@ -901,8 +865,6 @@ define amdgpu_kernel void @add_inline_imm_neg_1.0_f16(ptr addrspace(1) %out, hal
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
 ; GFX11-NEXT:    v_add_f16_e64 v0, s4, -1.0 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0xe6,0x01,0x00]
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
-; GFX11-NEXT:    s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
 ; GFX11-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 ;
 ; VI-LABEL: add_inline_imm_neg_1.0_f16:
@@ -957,8 +919,6 @@ define amdgpu_kernel void @add_inline_imm_2.0_f16(ptr addrspace(1) %out, half %x
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
 ; GFX11-NEXT:    v_add_f16_e64 v0, s4, 2.0 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0xe8,0x01,0x00]
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
-; GFX11-NEXT:    s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
 ; GFX11-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 ;
 ; VI-LABEL: add_inline_imm_2.0_f16:
@@ -1013,8 +973,6 @@ define amdgpu_kernel void @add_inline_imm_neg_2.0_f16(ptr addrspace(1) %out, hal
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
 ; GFX11-NEXT:    v_add_f16_e64 v0, s4, -2.0 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0xea,0x01,0x00]
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
-; GFX11-NEXT:    s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
 ; GFX11-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 ;
 ; VI-LABEL: add_inline_imm_neg_2.0_f16:
@@ -1069,8 +1027,6 @@ define amdgpu_kernel void @add_inline_imm_4.0_f16(ptr addrspace(1) %out, half %x
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
 ; GFX11-NEXT:    v_add_f16_e64 v0, s4, 4.0 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0xec,0x01,0x00]
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
-; GFX11-NEXT:    s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
 ; GFX11-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 ;
 ; VI-LABEL: add_inline_imm_4.0_f16:
@@ -1125,8 +1081,6 @@ define amdgpu_kernel void @add_inline_imm_neg_4.0_f16(ptr addrspace(1) %out, hal
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
 ; GFX11-NEXT:    v_add_f16_e64 v0, s4, -4.0 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0xee,0x01,0x00]
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
-; GFX11-NEXT:    s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
 ; GFX11-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 ;
 ; VI-LABEL: add_inline_imm_neg_4.0_f16:
@@ -1193,8 +1147,6 @@ define amdgpu_kernel void @commute_add_inline_imm_0.5_f16(ptr addrspace(1) %out,
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
 ; GFX11-NEXT:    v_add_f16_e32 v0, 0.5, v0 ; encoding: [0xf0,0x00,0x00,0x64]
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x01,0x80]
-; GFX11-NEXT:    s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
 ; GFX11-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 ;
 ; VI-LABEL: commute_add_inline_imm_0.5_f16:
@@ -1275,8 +1227,6 @@ define amdgpu_kernel void @commute_add_literal_f16(ptr addrspace(1) %out, ptr ad
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
 ; GFX11-NEXT:    v_add_f16_e32 v0, 0x6400, v0 ; encoding: [0xff,0x00,0x00,0x64,0x00,0x64,0x00,0x00]
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x01,0x80]
-; GFX11-NEXT:    s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
 ; GFX11-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 ;
 ; VI-LABEL: commute_add_literal_f16:
@@ -1345,8 +1295,6 @@ define amdgpu_kernel void @add_inline_imm_1_f16(ptr addrspace(1) %out, half %x)
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
 ; GFX11-NEXT:    v_add_f16_e64 v0, s4, 1 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0x02,0x01,0x00]
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
-; GFX11-NEXT:    s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
 ; GFX11-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 ;
 ; VI-LABEL: add_inline_imm_1_f16:
@@ -1401,8 +1349,6 @@ define amdgpu_kernel void @add_inline_imm_2_f16(ptr addrspace(1) %out, half %x)
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
 ; GFX11-NEXT:    v_add_f16_e64 v0, s4, 2 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0x04,0x01,0x00]
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
-; GFX11-NEXT:    s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
 ; GFX11-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 ;
 ; VI-LABEL: add_inline_imm_2_f16:
@@ -1457,8 +1403,6 @@ define amdgpu_kernel void @add_inline_imm_16_f16(ptr addrspace(1) %out, half %x)
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
 ; GFX11-NEXT:    v_add_f16_e64 v0, s4, 16 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0x20,0x01,0x00]
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
-; GFX11-NEXT:    s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
 ; GFX11-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 ;
 ; VI-LABEL: add_inline_imm_16_f16:
@@ -1525,8 +1469,6 @@ define amdgpu_kernel void @add_inline_imm_neg_1_f16(ptr addrspace(1) %out, ptr a
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
 ; GFX11-NEXT:    v_add_nc_u32_e32 v0, -1, v0 ; encoding: [0xc1,0x00,0x00,0x4a]
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x01,0x80]
-; GFX11-NEXT:    s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
 ; GFX11-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 ;
 ; VI-LABEL: add_inline_imm_neg_1_f16:
@@ -1606,8 +1548,6 @@ define amdgpu_kernel void @add_inline_imm_neg_2_f16(ptr addrspace(1) %out, ptr a
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
 ; GFX11-NEXT:    v_add_nc_u32_e32 v0, 0xfffe, v0 ; encoding: [0xff,0x00,0x00,0x4a,0xfe,0xff,0x00,0x00]
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x01,0x80]
-; GFX11-NEXT:    s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
 ; GFX11-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 ;
 ; VI-LABEL: add_inline_imm_neg_2_f16:
@@ -1687,8 +1627,6 @@ define amdgpu_kernel void @add_inline_imm_neg_16_f16(ptr addrspace(1) %out, ptr
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
 ; GFX11-NEXT:    v_add_nc_u32_e32 v0, 0xfff0, v0 ; encoding: [0xff,0x00,0x00,0x4a,0xf0,0xff,0x00,0x00]
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x01,0x80]
-; GFX11-NEXT:    s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
 ; GFX11-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 ;
 ; VI-LABEL: add_inline_imm_neg_16_f16:
@@ -1756,8 +1694,6 @@ define amdgpu_kernel void @add_inline_imm_63_f16(ptr addrspace(1) %out, half %x)
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
 ; GFX11-NEXT:    v_add_f16_e64 v0, s4, 63 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0x7e,0x01,0x00]
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
-; GFX11-NEXT:    s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
 ; GFX11-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 ;
 ; VI-LABEL: add_inline_imm_63_f16:
@@ -1812,8 +1748,6 @@ define amdgpu_kernel void @add_inline_imm_64_f16(ptr addrspace(1) %out, half %x)
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
 ; GFX11-NEXT:    v_add_f16_e64 v0, s4, 64 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0x80,0x01,0x00]
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
-; GFX11-NEXT:    s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
 ; GFX11-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 ;
 ; VI-LABEL: add_inline_imm_64_f16:
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
index 647870f0e08979..a5e0e5fdcb9a0d 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
@@ -41,8 +41,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_0(ptr addrspace(1) %out, ptr ad
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr
   %vecins = insertelement <2 x i16> %vec, i16 999, i32 0
@@ -109,8 +107,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reg(ptr addrspace(1) %out, pt
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr
   %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
@@ -192,8 +188,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(ptr addrspac
 ; GFX11-NEXT:    ;;#ASMSTART
 ; GFX11-NEXT:    ; use s1
 ; GFX11-NEXT:    ;;#ASMEND
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr
   %elt1 = extractelement <2 x i16> %vec, i32 1
@@ -260,8 +254,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(ptr addrspace(1) %out,
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr
   %elt.hi = lshr i32 %elt.arg, 16
@@ -343,8 +335,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(ptr addrspa
 ; GFX11-NEXT:    ;;#ASMSTART
 ; GFX11-NEXT:    ; use s0
 ; GFX11-NEXT:    ;;#ASMEND
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr
   %elt.hi = lshr i32 %elt.arg, 16
@@ -442,8 +432,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(ptr ad
 ; GFX11-NEXT:    ;;#ASMSTART
 ; GFX11-NEXT:    ; use s1
 ; GFX11-NEXT:    ;;#ASMEND
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr
   %elt.hi = lshr i32 %elt.arg, 16
@@ -496,8 +484,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_1(ptr addrspace(1) %out, ptr ad
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr
   %vecins = insertelement <2 x i16> %vec, i16 999, i32 1
@@ -563,8 +549,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_1_reg(ptr addrspace(1) %out, pt
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr
   %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 1
@@ -611,8 +595,6 @@ define amdgpu_kernel void @s_insertelement_v2f16_0(ptr addrspace(1) %out, ptr ad
 ; GFX11-NEXT:    s_pack_ll_b32_b16 s2, 0x4500, s2
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %vec = load <2 x half>, ptr addrspace(4) %vec.ptr
   %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 0
@@ -657,8 +639,6 @@ define amdgpu_kernel void @s_insertelement_v2f16_1(ptr addrspace(1) %out, ptr ad
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %vec = load <2 x half>, ptr addrspace(4) %vec.ptr
   %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 1
@@ -728,8 +708,6 @@ define amdgpu_kernel void @v_insertelement_v2i16_0(ptr addrspace(1) %out, ptr ad
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_bfi_b32 v1, 0xffff, s2, v1
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
@@ -806,8 +784,6 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(ptr addrspace(1) %out,
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_perm_b32 v1, v1, s0, 0x7060302
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
@@ -881,8 +857,6 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(ptr addrspace(1) %o
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_bfi_b32 v1, 0xffff, 53, v1
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
@@ -957,8 +931,6 @@ define amdgpu_kernel void @v_insertelement_v2i16_1(ptr addrspace(1) %out, ptr ad
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_perm_b32 v1, s2, v1, 0x5040100
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
@@ -1030,8 +1002,6 @@ define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(ptr addrspace(1) %o
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_perm_b32 v1, -15, v1, 0x5040100
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
@@ -1105,8 +1075,6 @@ define amdgpu_kernel void @v_insertelement_v2f16_0(ptr addrspace(1) %out, ptr ad
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_bfi_b32 v1, 0xffff, s2, v1
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
@@ -1178,8 +1146,6 @@ define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(ptr addrspace(1) %o
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_bfi_b32 v1, 0xffff, 53, v1
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
@@ -1253,8 +1219,6 @@ define amdgpu_kernel void @v_insertelement_v2f16_1(ptr addrspace(1) %out, ptr ad
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_perm_b32 v1, s2, v1, 0x5040100
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
@@ -1326,8 +1290,6 @@ define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(ptr addrspace(1) %o
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_perm_b32 v1, 35, v1, 0x5040100
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
@@ -1415,8 +1377,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out,
 ; GFX11-NEXT:    s_or_b32 s2, s3, s2
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %idx = load volatile i32, ptr addrspace(4) %idx.ptr
   %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr
@@ -1498,8 +1458,6 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(ptr addrspace(1) %
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_bfi_b32 v1, s0, 0x3e703e7, v1
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
@@ -1602,8 +1560,6 @@ define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(ptr addrspace(1) %
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_bfi_b32 v1, v1, 0x12341234, v2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
@@ -1684,8 +1640,6 @@ define amdgpu_kernel void @v_insertelement_v4f16_0(ptr addrspace(1) %out, ptr ad
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_bfi_b32 v0, 0xffff, s0, v0
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
@@ -1765,8 +1719,6 @@ define amdgpu_kernel void @v_insertelement_v4f16_1(ptr addrspace(1) %out, ptr ad
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_perm_b32 v0, s0, v0, 0x5040100
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
@@ -1847,8 +1799,6 @@ define amdgpu_kernel void @v_insertelement_v4f16_2(ptr addrspace(1) %out, ptr ad
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_bfi_b32 v1, 0xffff, s0, v1
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
@@ -1928,8 +1878,6 @@ define amdgpu_kernel void @v_insertelement_v4f16_3(ptr addrspace(1) %out, ptr ad
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_perm_b32 v1, s0, v1, 0x5040100
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
@@ -2010,8 +1958,6 @@ define amdgpu_kernel void @v_insertelement_v4i16_2(ptr addrspace(1) %out, ptr ad
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_bfi_b32 v1, 0xffff, s0, v1
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
@@ -2120,8 +2066,6 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(ptr addrspace(1) %
 ; GFX11-NEXT:    v_bfi_b32 v1, v3, s0, v1
 ; GFX11-NEXT:    v_bfi_b32 v0, v2, s0, v0
 ; GFX11-NEXT:    global_store_b64 v4, v[0:1], s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
@@ -2224,8 +2168,6 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(ptr addrspace(1) %
 ; GFX11-NEXT:    v_bfi_b32 v1, s1, s2, v1
 ; GFX11-NEXT:    v_bfi_b32 v0, s0, s2, v0
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
@@ -2306,8 +2248,6 @@ define amdgpu_kernel void @v_insertelement_v8f16_3(ptr addrspace(1) %out, ptr ad
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_perm_b32 v1, s0, v1, 0x5040100
 ; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
@@ -2389,8 +2329,6 @@ define amdgpu_kernel void @v_insertelement_v8i16_6(ptr addrspace(1) %out, ptr ad
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_bfi_b32 v3, 0xffff, s0, v3
 ; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
@@ -2619,8 +2557,6 @@ define amdgpu_kernel void @v_insertelement_v8f16_dynamic(ptr addrspace(1) %out,
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX11-NEXT:    v_perm_b32 v0, v8, v0, 0x5040100
 ; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
@@ -2723,8 +2659,6 @@ define amdgpu_kernel void @v_insertelement_v16f16_3(ptr addrspace(1) %out, ptr a
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b128 v8, v[4:7], s[4:5] offset:16
 ; GFX11-NEXT:    global_store_b128 v8, v[0:3], s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
@@ -2827,8 +2761,6 @@ define amdgpu_kernel void @v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr a
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b128 v8, v[4:7], s[4:5] offset:16
 ; GFX11-NEXT:    global_store_b128 v8, v[0:3], s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
@@ -3231,8 +3163,6 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out,
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b128 v8, v[4:7], s[4:5] offset:16
 ; GFX11-NEXT:    global_store_b128 v8, v[0:3], s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll
index 026a8d7da7080b..d26f0df49b0a83 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll
@@ -574,8 +574,6 @@ define amdgpu_ps void @non_cst_non_compare_input(ptr addrspace(1) %out, i32 %tid
 ; GFX11-NEXT:    v_cmp_ne_u32_e64 s0, 0, v2
 ; GFX11-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %cmp = icmp eq i32 %cond, 0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bitreplicate.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bitreplicate.ll
index b3079f5c978715..8ae571df670aa5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bitreplicate.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bitreplicate.ll
@@ -68,8 +68,6 @@ define amdgpu_cs void @test_s_bitreplicate_sgpr(i32 inreg %mask, ptr addrspace(1
 ; GFX11-NEXT:    s_bitreplicate_b64_b32 s[0:1], s0
 ; GFX11-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX11-NEXT:    global_store_b64 v[0:1], v[2:3], off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %br = call i64 @llvm.amdgcn.s.bitreplicate(i32 %mask)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll
index 1139271003119c..7f1c01a7a00085 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll
@@ -46,8 +46,6 @@ define amdgpu_cs void @test_cvt_pk_bf8_f32_word0(i32 %a, float %y, i32 %old, ptr
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    v_cvt_pk_bf8_f32_e64_dpp v2, v0, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX12-NEXT:    global_store_b32 v[3:4], v2, off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1)
   %tmp1 = bitcast i32 %tmp0 to float
@@ -63,8 +61,6 @@ define amdgpu_cs void @test_cvt_pk_fp8_f32_word1(i32 %a, float %y, i32 %old, ptr
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_cvt_pk_fp8_f32 v2, v0, v1 op_sel:[0,0,1]
 ; GFX12-NEXT:    global_store_b32 v[3:4], v2, off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1)
   %tmp1 = bitcast i32 %tmp0 to float
@@ -78,8 +74,6 @@ define amdgpu_cs void @test_cvt_sr_bf8_f32_byte0(i32 %a, i32 %r, i32 %old, ptr a
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    v_cvt_sr_bf8_f32_e64_dpp v2, v0, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX12-NEXT:    global_store_b32 v[3:4], v2, off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1)
   %tmp1 = bitcast i32 %tmp0 to float
@@ -93,8 +87,6 @@ define amdgpu_cs void @test_cvt_sr_fp8_f32_byte1(i32 %a, i32 %r, i32 %old, ptr a
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    v_cvt_sr_fp8_f32_e64_dpp v2, v0, v1 byte_sel:1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX12-NEXT:    global_store_b32 v[3:4], v2, off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1)
   %tmp1 = bitcast i32 %tmp0 to float
@@ -108,8 +100,6 @@ define amdgpu_cs void @test_cvt_sr_fp8_f32_byte2(i32 %a, i32 %r, i32 %old, ptr a
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    v_cvt_sr_fp8_f32_e64_dpp v2, v0, v1 byte_sel:2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX12-NEXT:    global_store_b32 v[3:4], v2, off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1)
   %tmp1 = bitcast i32 %tmp0 to float
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll
index eed85345b3b1ce..7524c7cbda6cd4 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll
@@ -56,8 +56,6 @@ define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, float %x
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_cvt_pk_rtz_f16_f32_e64 v1, s2, s3
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %y)
   store <2 x half> %result, ptr addrspace(1) %out
@@ -117,8 +115,6 @@ define amdgpu_kernel void @s_cvt_pkrtz_samereg_v2f16_f32(ptr addrspace(1) %out,
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_cvt_pk_rtz_f16_f32_e64 v1, s4, s4
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %x)
   store <2 x half> %result, ptr addrspace(1) %out
@@ -230,8 +226,6 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, ptr addr
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cvt_pk_rtz_f16_f32_e32 v1, v1, v2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -312,8 +306,6 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(ptr addrspace(1) %out,
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cvt_pk_rtz_f16_f32_e64 v1, v1, 1.0
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -392,8 +384,6 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(ptr addrspace(1) %out,
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cvt_pk_rtz_f16_f32_e32 v1, 1.0, v1
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -493,8 +483,6 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(ptr addrspace(1) %out,
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cvt_pk_rtz_f16_f32_e64 v1, -v1, v2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -597,8 +585,6 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(ptr addrspace(1) %out,
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cvt_pk_rtz_f16_f32_e64 v1, v1, -v2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -701,8 +687,6 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(ptr addrspace(1) %ou
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cvt_pk_rtz_f16_f32_e64 v1, -v1, -v2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -806,8 +790,6 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(ptr addrsp
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cvt_pk_rtz_f16_f32_e64 v1, -|v1|, -v2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.add.gs.reg.rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.add.gs.reg.rtn.ll
index 29535726b7496e..081727c3b5e14f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.add.gs.reg.rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.add.gs.reg.rtn.ll
@@ -20,8 +20,6 @@ define amdgpu_gs void @test_add_32_use(i32 %arg, ptr addrspace(1) %out) {
 ; CHECK-NEXT:    ds_add_gs_reg_rtn v[3:4], v0 offset:16 gds
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    global_store_b32 v[1:2], v3, off
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; CHECK-NEXT:    s_endpgm
   %res = call i32 @llvm.amdgcn.ds.add.gs.reg.rtn.i32(i32 %arg, i32 16)
   store i32 %res, ptr addrspace(1) %out, align 4
@@ -43,8 +41,6 @@ define amdgpu_gs void @test_add_64_use(i32 %arg, ptr addrspace(1) %out) {
 ; CHECK-NEXT:    ds_add_gs_reg_rtn v[3:4], v0 offset:32 gds
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    global_store_b64 v[1:2], v[3:4], off
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; CHECK-NEXT:    s_endpgm
   %res = call i64 @llvm.amdgcn.ds.add.gs.reg.rtn.i64(i32 %arg, i32 32)
   store i64 %res, ptr addrspace(1) %out, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bvh.stack.rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bvh.stack.rtn.ll
index 0ce3f85425e0a8..644ecf28fd32f0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bvh.stack.rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bvh.stack.rtn.ll
@@ -11,8 +11,6 @@ define amdgpu_gs void @test_ds_bvh_stack(i32 %addr, i32 %data0, <4 x i32> %data1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    v_add_nc_u32_e32 v0, v1, v0
 ; CHECK-NEXT:    global_store_b32 v[6:7], v0, off
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; CHECK-NEXT:    s_endpgm
   %pair = call { i32, i32 } @llvm.amdgcn.ds.bvh.stack.rtn(i32 %addr, i32 %data0, <4 x i32> %data1, i32 0)
   %vdst = extractvalue { i32, i32 } %pair, 0
@@ -29,8 +27,6 @@ define amdgpu_gs void @test_ds_bvh_stack_1(i32 %addr, i32 %data0, <4 x i32> %dat
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    v_add_nc_u32_e32 v0, v1, v0
 ; CHECK-NEXT:    global_store_b32 v[6:7], v0, off
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; CHECK-NEXT:    s_endpgm
   %pair = call { i32, i32 } @llvm.amdgcn.ds.bvh.stack.rtn(i32 %addr, i32 %data0, <4 x i32> %data1, i32 1)
   %vdst = extractvalue { i32, i32 } %pair, 0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.sub.gs.reg.rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.sub.gs.reg.rtn.ll
index 96c35df3c234fb..63d4bac20b354e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.sub.gs.reg.rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.sub.gs.reg.rtn.ll
@@ -20,8 +20,6 @@ define amdgpu_gs void @test_sub_32_use(i32 %arg, ptr addrspace(1) %out) {
 ; CHECK-NEXT:    ds_sub_gs_reg_rtn v[3:4], v0 offset:16 gds
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    global_store_b32 v[1:2], v3, off
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; CHECK-NEXT:    s_endpgm
   %res = call i32 @llvm.amdgcn.ds.sub.gs.reg.rtn.i32(i32 %arg, i32 16)
   store i32 %res, ptr addrspace(1) %out, align 4
@@ -43,8 +41,6 @@ define amdgpu_gs void @test_sub_64_use(i32 %arg, ptr addrspace(1) %out) {
 ; CHECK-NEXT:    ds_sub_gs_reg_rtn v[3:4], v0 offset:32 gds
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    global_store_b64 v[1:2], v[3:4], off
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; CHECK-NEXT:    s_endpgm
   %res = call i64 @llvm.amdgcn.ds.sub.gs.reg.rtn.i64(i32 %arg, i32 32)
   store i64 %res, ptr addrspace(1) %out, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll
index dad77d1efd3a85..18b03efe44efa9 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll
@@ -21,8 +21,6 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT:    s_nop 0
-; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f32_oeq_with_fabs:
@@ -44,8 +42,6 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f32_oeq_with_fabs:
@@ -72,8 +68,6 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace(
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT:    s_nop 0
-; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs:
@@ -95,8 +89,6 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace(
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs:
@@ -130,8 +122,6 @@ define amdgpu_kernel void @v_fcmp_f32(ptr addrspace(1) %out, float %src) {
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GISEL-GFX11-NEXT:    global_store_b32 v0, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f32:
@@ -157,8 +147,6 @@ define amdgpu_kernel void @v_fcmp_f32_oeq(ptr addrspace(1) %out, float %src) {
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT:    s_nop 0
-; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f32_oeq:
@@ -184,8 +172,6 @@ define amdgpu_kernel void @v_fcmp_f32_oeq(ptr addrspace(1) %out, float %src) {
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f32_oeq:
@@ -215,8 +201,6 @@ define amdgpu_kernel void @v_fcmp_f32_one(ptr addrspace(1) %out, float %src) {
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT:    s_nop 0
-; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f32_one:
@@ -242,8 +226,6 @@ define amdgpu_kernel void @v_fcmp_f32_one(ptr addrspace(1) %out, float %src) {
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f32_one:
@@ -273,8 +255,6 @@ define amdgpu_kernel void @v_fcmp_f32_ogt(ptr addrspace(1) %out, float %src) {
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT:    s_nop 0
-; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f32_ogt:
@@ -300,8 +280,6 @@ define amdgpu_kernel void @v_fcmp_f32_ogt(ptr addrspace(1) %out, float %src) {
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f32_ogt:
@@ -331,8 +309,6 @@ define amdgpu_kernel void @v_fcmp_f32_oge(ptr addrspace(1) %out, float %src) {
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT:    s_nop 0
-; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f32_oge:
@@ -358,8 +334,6 @@ define amdgpu_kernel void @v_fcmp_f32_oge(ptr addrspace(1) %out, float %src) {
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f32_oge:
@@ -389,8 +363,6 @@ define amdgpu_kernel void @v_fcmp_f32_olt(ptr addrspace(1) %out, float %src) {
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT:    s_nop 0
-; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f32_olt:
@@ -416,8 +388,6 @@ define amdgpu_kernel void @v_fcmp_f32_olt(ptr addrspace(1) %out, float %src) {
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f32_olt:
@@ -447,8 +417,6 @@ define amdgpu_kernel void @v_fcmp_f32_ole(ptr addrspace(1) %out, float %src) {
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT:    s_nop 0
-; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f32_ole:
@@ -474,8 +442,6 @@ define amdgpu_kernel void @v_fcmp_f32_ole(ptr addrspace(1) %out, float %src) {
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f32_ole:
@@ -505,8 +471,6 @@ define amdgpu_kernel void @v_fcmp_f32_o(ptr addrspace(1) %out, float %src) {
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT:    s_nop 0
-; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f32_o:
@@ -532,8 +496,6 @@ define amdgpu_kernel void @v_fcmp_f32_o(ptr addrspace(1) %out, float %src) {
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f32_o:
@@ -563,8 +525,6 @@ define amdgpu_kernel void @v_fcmp_f32_uo(ptr addrspace(1) %out, float %src) {
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT:    s_nop 0
-; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f32_uo:
@@ -590,8 +550,6 @@ define amdgpu_kernel void @v_fcmp_f32_uo(ptr addrspace(1) %out, float %src) {
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f32_uo:
@@ -621,8 +579,6 @@ define amdgpu_kernel void @v_fcmp_f32_ueq(ptr addrspace(1) %out, float %src) {
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT:    s_nop 0
-; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f32_ueq:
@@ -648,8 +604,6 @@ define amdgpu_kernel void @v_fcmp_f32_ueq(ptr addrspace(1) %out, float %src) {
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f32_ueq:
@@ -679,8 +633,6 @@ define amdgpu_kernel void @v_fcmp_f32_une(ptr addrspace(1) %out, float %src) {
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT:    s_nop 0
-; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f32_une:
@@ -706,8 +658,6 @@ define amdgpu_kernel void @v_fcmp_f32_une(ptr addrspace(1) %out, float %src) {
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f32_une:
@@ -737,8 +687,6 @@ define amdgpu_kernel void @v_fcmp_f32_ugt(ptr addrspace(1) %out, float %src) {
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT:    s_nop 0
-; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f32_ugt:
@@ -764,8 +712,6 @@ define amdgpu_kernel void @v_fcmp_f32_ugt(ptr addrspace(1) %out, float %src) {
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f32_ugt:
@@ -795,8 +741,6 @@ define amdgpu_kernel void @v_fcmp_f32_uge(ptr addrspace(1) %out, float %src) {
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT:    s_nop 0
-; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f32_uge:
@@ -822,8 +766,6 @@ define amdgpu_kernel void @v_fcmp_f32_uge(ptr addrspace(1) %out, float %src) {
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f32_uge:
@@ -853,8 +795,6 @@ define amdgpu_kernel void @v_fcmp_f32_ult(ptr addrspace(1) %out, float %src) {
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT:    s_nop 0
-; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f32_ult:
@@ -880,8 +820,6 @@ define amdgpu_kernel void @v_fcmp_f32_ult(ptr addrspace(1) %out, float %src) {
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f32_ult:
@@ -911,8 +849,6 @@ define amdgpu_kernel void @v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) {
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT:    s_nop 0
-; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f32_ule:
@@ -938,8 +874,6 @@ define amdgpu_kernel void @v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) {
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f32_ule:
@@ -967,8 +901,6 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) {
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT:    s_nop 0
-; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f64_oeq:
@@ -990,8 +922,6 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) {
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f64_oeq:
@@ -1017,8 +947,6 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) {
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT:    s_nop 0
-; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f64_one:
@@ -1040,8 +968,6 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) {
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f64_one:
@@ -1067,8 +993,6 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) {
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT:    s_nop 0
-; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f64_ogt:
@@ -1090,8 +1014,6 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) {
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f64_ogt:
@@ -1117,8 +1039,6 @@ define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) {
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT:    s_nop 0
-; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f64_oge:
@@ -1140,8 +1060,6 @@ define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) {
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f64_oge:
@@ -1167,8 +1085,6 @@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) {
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT:    s_nop 0
-; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f64_olt:
@@ -1190,8 +1106,6 @@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) {
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f64_olt:
@@ -1217,8 +1131,6 @@ define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) {
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT:    s_nop 0
-; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f64_ole:
@@ -1240,8 +1152,6 @@ define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) {
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f64_ole:
@@ -1267,8 +1177,6 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) {
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT:    s_nop 0
-; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f64_ueq:
@@ -1290,8 +1198,6 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) {
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f64_ueq:
@@ -1317,8 +1223,6 @@ define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) {
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT:    s_nop 0
-; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f64_o:
@@ -1340,8 +1244,6 @@ define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) {
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f64_o:
@@ -1367,8 +1269,6 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) {
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT:    s_nop 0
-; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f64_uo:
@@ -1390,8 +1290,6 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) {
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f64_uo:
@@ -1417,8 +1315,6 @@ define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) {
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT:    s_nop 0
-; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f64_une:
@@ -1440,8 +1336,6 @@ define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) {
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f64_une:
@@ -1467,8 +1361,6 @@ define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) {
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT:    s_nop 0
-; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f64_ugt:
@@ -1490,8 +1382,6 @@ define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) {
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f64_ugt:
@@ -1517,8 +1407,6 @@ define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) {
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT:    s_nop 0
-; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f64_uge:
@@ -1540,8 +1428,6 @@ define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) {
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f64_uge:
@@ -1567,8 +1453,6 @@ define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) {
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT:    s_nop 0
-; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f64_ult:
@@ -1590,8 +1474,6 @@ define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) {
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f64_ult:
@@ -1617,8 +1499,6 @@ define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) {
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT:    s_nop 0
-; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f64_ule:
@@ -1640,8 +1520,6 @@ define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) {
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f64_ule:
@@ -1671,8 +1549,6 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_with_fabs(ptr addrspace(1) %out, half
 ; SDAG-GFX11-NEXT:    v_cmp_eq_f16_e64 s2, s4, |s2|
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT:    s_nop 0
-; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f16_oeq_with_fabs:
@@ -1700,8 +1576,6 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_with_fabs(ptr addrspace(1) %out, half
 ; GISEL-GFX11-NEXT:    v_cmp_eq_f16_e64 s2, s4, |s2|
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f16_oeq_with_fabs:
@@ -1735,8 +1609,6 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_both_operands_with_fabs(ptr addrspace(
 ; SDAG-GFX11-NEXT:    v_cmp_eq_f16_e64 s2, |s4|, |s2|
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT:    s_nop 0
-; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f16_oeq_both_operands_with_fabs:
@@ -1764,8 +1636,6 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_both_operands_with_fabs(ptr addrspace(
 ; GISEL-GFX11-NEXT:    v_cmp_eq_f16_e64 s2, |s4|, |s2|
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f16_oeq_both_operands_with_fabs:
@@ -1802,8 +1672,6 @@ define amdgpu_kernel void @v_fcmp_f16(ptr addrspace(1) %out, half %src) {
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GISEL-GFX11-NEXT:    global_store_b32 v0, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f16:
@@ -1830,8 +1698,6 @@ define amdgpu_kernel void @v_fcmp_f16_oeq(ptr addrspace(1) %out, half %src) {
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT:    s_nop 0
-; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f16_oeq:
@@ -1857,8 +1723,6 @@ define amdgpu_kernel void @v_fcmp_f16_oeq(ptr addrspace(1) %out, half %src) {
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f16_oeq:
@@ -1889,8 +1753,6 @@ define amdgpu_kernel void @v_fcmp_f16_one(ptr addrspace(1) %out, half %src) {
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT:    s_nop 0
-; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f16_one:
@@ -1916,8 +1778,6 @@ define amdgpu_kernel void @v_fcmp_f16_one(ptr addrspace(1) %out, half %src) {
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f16_one:
@@ -1948,8 +1808,6 @@ define amdgpu_kernel void @v_fcmp_f16_ogt(ptr addrspace(1) %out, half %src) {
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT:    s_nop 0
-; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f16_ogt:
@@ -1975,8 +1833,6 @@ define amdgpu_kernel void @v_fcmp_f16_ogt(ptr addrspace(1) %out, half %src) {
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f16_ogt:
@@ -2007,8 +1863,6 @@ define amdgpu_kernel void @v_fcmp_f16_oge(ptr addrspace(1) %out, half %src) {
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT:    s_nop 0
-; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f16_oge:
@@ -2034,8 +1888,6 @@ define amdgpu_kernel void @v_fcmp_f16_oge(ptr addrspace(1) %out, half %src) {
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f16_oge:
@@ -2066,8 +1918,6 @@ define amdgpu_kernel void @v_fcmp_f16_olt(ptr addrspace(1) %out, half %src) {
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT:    s_nop 0
-; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f16_olt:
@@ -2093,8 +1943,6 @@ define amdgpu_kernel void @v_fcmp_f16_olt(ptr addrspace(1) %out, half %src) {
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f16_olt:
@@ -2125,8 +1973,6 @@ define amdgpu_kernel void @v_fcmp_f16_ole(ptr addrspace(1) %out, half %src) {
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT:    s_nop 0
-; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f16_ole:
@@ -2152,8 +1998,6 @@ define amdgpu_kernel void @v_fcmp_f16_ole(ptr addrspace(1) %out, half %src) {
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f16_ole:
@@ -2184,8 +2028,6 @@ define amdgpu_kernel void @v_fcmp_f16_ueq(ptr addrspace(1) %out, half %src) {
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT:    s_nop 0
-; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f16_ueq:
@@ -2211,8 +2053,6 @@ define amdgpu_kernel void @v_fcmp_f16_ueq(ptr addrspace(1) %out, half %src) {
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f16_ueq:
@@ -2243,8 +2083,6 @@ define amdgpu_kernel void @v_fcmp_f16_une(ptr addrspace(1) %out, half %src) {
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT:    s_nop 0
-; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f16_une:
@@ -2270,8 +2108,6 @@ define amdgpu_kernel void @v_fcmp_f16_une(ptr addrspace(1) %out, half %src) {
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f16_une:
@@ -2302,8 +2138,6 @@ define amdgpu_kernel void @v_fcmp_f16_ugt(ptr addrspace(1) %out, half %src) {
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT:    s_nop 0
-; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f16_ugt:
@@ -2329,8 +2163,6 @@ define amdgpu_kernel void @v_fcmp_f16_ugt(ptr addrspace(1) %out, half %src) {
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f16_ugt:
@@ -2361,8 +2193,6 @@ define amdgpu_kernel void @v_fcmp_f16_uge(ptr addrspace(1) %out, half %src) {
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT:    s_nop 0
-; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f16_uge:
@@ -2388,8 +2218,6 @@ define amdgpu_kernel void @v_fcmp_f16_uge(ptr addrspace(1) %out, half %src) {
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f16_uge:
@@ -2420,8 +2248,6 @@ define amdgpu_kernel void @v_fcmp_f16_ult(ptr addrspace(1) %out, half %src) {
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT:    s_nop 0
-; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f16_ult:
@@ -2447,8 +2273,6 @@ define amdgpu_kernel void @v_fcmp_f16_ult(ptr addrspace(1) %out, half %src) {
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f16_ult:
@@ -2478,8 +2302,6 @@ define amdgpu_kernel void @v_fcmp_f16_o(ptr addrspace(1) %out, half %src) {
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT:    s_nop 0
-; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f16_o:
@@ -2505,8 +2327,6 @@ define amdgpu_kernel void @v_fcmp_f16_o(ptr addrspace(1) %out, half %src) {
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f16_o:
@@ -2536,8 +2356,6 @@ define amdgpu_kernel void @v_fcmp_f16_uo(ptr addrspace(1) %out, half %src) {
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT:    s_nop 0
-; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f16_uo:
@@ -2563,8 +2381,6 @@ define amdgpu_kernel void @v_fcmp_f16_uo(ptr addrspace(1) %out, half %src) {
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f16_uo:
@@ -2594,8 +2410,6 @@ define amdgpu_kernel void @v_fcmp_f16_ule(ptr addrspace(1) %out, half %src) {
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT:    s_nop 0
-; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_fcmp_f16_ule:
@@ -2621,8 +2435,6 @@ define amdgpu_kernel void @v_fcmp_f16_ule(ptr addrspace(1) %out, half %src) {
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_fcmp_f16_ule:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll
index d1883d9196af65..2dddf37febf945 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll
@@ -24,8 +24,6 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f32_oeq_with_fabs:
@@ -82,8 +80,6 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace(
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs:
@@ -141,8 +137,6 @@ define amdgpu_kernel void @v_fcmp_f32(ptr addrspace(1) %out, float %src) {
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX9-SDAG-LABEL: v_fcmp_f32:
@@ -187,8 +181,6 @@ define amdgpu_kernel void @v_fcmp_f32_oeq(ptr addrspace(1) %out, float %src) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f32_oeq:
@@ -249,8 +241,6 @@ define amdgpu_kernel void @v_fcmp_f32_one(ptr addrspace(1) %out, float %src) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f32_one:
@@ -311,8 +301,6 @@ define amdgpu_kernel void @v_fcmp_f32_ogt(ptr addrspace(1) %out, float %src) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f32_ogt:
@@ -373,8 +361,6 @@ define amdgpu_kernel void @v_fcmp_f32_oge(ptr addrspace(1) %out, float %src) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f32_oge:
@@ -435,8 +421,6 @@ define amdgpu_kernel void @v_fcmp_f32_olt(ptr addrspace(1) %out, float %src) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f32_olt:
@@ -497,8 +481,6 @@ define amdgpu_kernel void @v_fcmp_f32_ole(ptr addrspace(1) %out, float %src) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f32_ole:
@@ -559,8 +541,6 @@ define amdgpu_kernel void @v_fcmp_f32_o(ptr addrspace(1) %out, float %src) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f32_o:
@@ -621,8 +601,6 @@ define amdgpu_kernel void @v_fcmp_f32_uo(ptr addrspace(1) %out, float %src) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f32_uo:
@@ -683,8 +661,6 @@ define amdgpu_kernel void @v_fcmp_f32_ueq(ptr addrspace(1) %out, float %src) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f32_ueq:
@@ -745,8 +721,6 @@ define amdgpu_kernel void @v_fcmp_f32_une(ptr addrspace(1) %out, float %src) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f32_une:
@@ -807,8 +781,6 @@ define amdgpu_kernel void @v_fcmp_f32_ugt(ptr addrspace(1) %out, float %src) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f32_ugt:
@@ -869,8 +841,6 @@ define amdgpu_kernel void @v_fcmp_f32_uge(ptr addrspace(1) %out, float %src) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f32_uge:
@@ -931,8 +901,6 @@ define amdgpu_kernel void @v_fcmp_f32_ult(ptr addrspace(1) %out, float %src) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f32_ult:
@@ -993,8 +961,6 @@ define amdgpu_kernel void @v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f32_ule:
@@ -1053,8 +1019,6 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f64_oeq:
@@ -1113,8 +1077,6 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f64_one:
@@ -1173,8 +1135,6 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f64_ogt:
@@ -1233,8 +1193,6 @@ define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f64_oge:
@@ -1293,8 +1251,6 @@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f64_olt:
@@ -1353,8 +1309,6 @@ define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f64_ole:
@@ -1413,8 +1367,6 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f64_ueq:
@@ -1473,8 +1425,6 @@ define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f64_o:
@@ -1533,8 +1483,6 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f64_uo:
@@ -1593,8 +1541,6 @@ define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f64_une:
@@ -1653,8 +1599,6 @@ define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f64_ugt:
@@ -1713,8 +1657,6 @@ define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f64_uge:
@@ -1773,8 +1715,6 @@ define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f64_ult:
@@ -1833,8 +1773,6 @@ define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f64_ule:
@@ -1898,8 +1836,6 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_with_fabs(ptr addrspace(1) %out, half
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f16_oeq_with_fabs:
@@ -1967,8 +1903,6 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_both_operands_with_fabs(ptr addrspace(
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f16_oeq_both_operands_with_fabs:
@@ -2032,8 +1966,6 @@ define amdgpu_kernel void @v_fcmp_f16(ptr addrspace(1) %out, half %src) {
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX9-SDAG-LABEL: v_fcmp_f16:
@@ -2079,8 +2011,6 @@ define amdgpu_kernel void @v_fcmp_f16_oeq(ptr addrspace(1) %out, half %src) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f16_oeq:
@@ -2142,8 +2072,6 @@ define amdgpu_kernel void @v_fcmp_f16_one(ptr addrspace(1) %out, half %src) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f16_one:
@@ -2205,8 +2133,6 @@ define amdgpu_kernel void @v_fcmp_f16_ogt(ptr addrspace(1) %out, half %src) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f16_ogt:
@@ -2268,8 +2194,6 @@ define amdgpu_kernel void @v_fcmp_f16_oge(ptr addrspace(1) %out, half %src) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f16_oge:
@@ -2331,8 +2255,6 @@ define amdgpu_kernel void @v_fcmp_f16_olt(ptr addrspace(1) %out, half %src) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f16_olt:
@@ -2394,8 +2316,6 @@ define amdgpu_kernel void @v_fcmp_f16_ole(ptr addrspace(1) %out, half %src) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f16_ole:
@@ -2457,8 +2377,6 @@ define amdgpu_kernel void @v_fcmp_f16_ueq(ptr addrspace(1) %out, half %src) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f16_ueq:
@@ -2520,8 +2438,6 @@ define amdgpu_kernel void @v_fcmp_f16_une(ptr addrspace(1) %out, half %src) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f16_une:
@@ -2583,8 +2499,6 @@ define amdgpu_kernel void @v_fcmp_f16_ugt(ptr addrspace(1) %out, half %src) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f16_ugt:
@@ -2646,8 +2560,6 @@ define amdgpu_kernel void @v_fcmp_f16_uge(ptr addrspace(1) %out, half %src) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f16_uge:
@@ -2709,8 +2621,6 @@ define amdgpu_kernel void @v_fcmp_f16_ult(ptr addrspace(1) %out, half %src) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f16_ult:
@@ -2771,8 +2681,6 @@ define amdgpu_kernel void @v_fcmp_f16_o(ptr addrspace(1) %out, half %src) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f16_o:
@@ -2833,8 +2741,6 @@ define amdgpu_kernel void @v_fcmp_f16_uo(ptr addrspace(1) %out, half %src) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f16_uo:
@@ -2895,8 +2801,6 @@ define amdgpu_kernel void @v_fcmp_f16_ule(ptr addrspace(1) %out, half %src) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_fcmp_f16_ule:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll
index 453913b334a49d..455323d01eb3fe 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll
@@ -17,8 +17,6 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_bf16_bf16(
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dot2_bf16_bf16 v1, s2, s3, v1
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
@@ -70,8 +68,6 @@ define amdgpu_ps void @test_llvm_amdgcn_fdot2_bf16_bf16_sis(
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_dot2_bf16_bf16 v2, s0, 0x3f803f80, v2
 ; GFX11-NEXT:    global_store_b16 v[0:1], v2, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     <2 x bfloat> inreg %a,
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll
index 1343f25ec275e5..9cf24539828b3c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll
@@ -16,8 +16,6 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16(
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dot2_f16_f16 v1, s2, s3, v1
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll
index 8a8b0490e9480b..5a2c4197eef5de 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll
@@ -17,8 +17,6 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f32_bf16_clamp(
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_dot2_f32_bf16 v0, s2, s3, v0 clamp
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
@@ -47,8 +45,6 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f32_bf16_no_clamp(
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_dot2_f32_bf16 v0, s2, s3, v0
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.ordered.add.b64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.ordered.add.b64.ll
index f631a0bfc28eb0..a97f1dcc2bdd49 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.ordered.add.b64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.ordered.add.b64.ll
@@ -40,8 +40,6 @@ define amdgpu_kernel void @global_atomic_ordered_add_b64_rtn(ptr addrspace(1) %a
 ; GFX12-SDAG-NEXT:    global_atomic_ordered_add_b64 v[0:1], v2, v[0:1], s[4:5] offset:32 th:TH_ATOMIC_RETURN
 ; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: global_atomic_ordered_add_b64_rtn:
@@ -55,8 +53,6 @@ define amdgpu_kernel void @global_atomic_ordered_add_b64_rtn(ptr addrspace(1) %a
 ; GFX12-GISEL-NEXT:    global_atomic_ordered_add_b64 v[0:1], v2, v[0:1], s[4:5] offset:32 th:TH_ATOMIC_RETURN
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll
index 88e3929819313c..e19a4bcf9e0a7b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll
@@ -16,8 +16,6 @@ define amdgpu_kernel void @global_load_tr_b64_v2i32(ptr addrspace(1) %addr, ptr
 ; GFX12-NEXT:    global_load_tr_b64 v[0:1], v2, s[0:1] offset:32
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
@@ -35,8 +33,6 @@ define amdgpu_kernel void @global_load_tr_b128_v8i16(ptr addrspace(1) %addr, ptr
 ; GFX12-NEXT:    global_load_tr_b128 v[0:3], v4, s[0:1] offset:32
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b128 v4, v[0:3], s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
@@ -54,8 +50,6 @@ define amdgpu_kernel void @global_load_tr_b128_v8f16(ptr addrspace(1) %addr, ptr
 ; GFX12-NEXT:    global_load_tr_b128 v[0:3], v4, s[0:1] offset:32
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b128 v4, v[0:3], s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
@@ -73,8 +67,6 @@ define amdgpu_kernel void @global_load_tr_b128_v8bf16(ptr addrspace(1) %addr, pt
 ; GFX12-NEXT:    global_load_tr_b128 v[0:3], v4, s[0:1] offset:32
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b128 v4, v[0:3], s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll
index a5841e04f6e07b..5e1fe792393b9a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll
@@ -16,8 +16,6 @@ define amdgpu_kernel void @global_load_tr_b64_i32(ptr addrspace(1) %addr, ptr ad
 ; GFX12-NEXT:    global_load_tr_b64 v1, v0, s[0:1] offset:32
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
@@ -35,8 +33,6 @@ define amdgpu_kernel void @global_load_tr_b128_v4i16(ptr addrspace(1) %addr, ptr
 ; GFX12-NEXT:    global_load_tr_b128 v[0:1], v2, s[0:1] offset:32
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
@@ -54,8 +50,6 @@ define amdgpu_kernel void @global_load_tr_b128_v4f16(ptr addrspace(1) %addr, ptr
 ; GFX12-NEXT:    global_load_tr_b128 v[0:1], v2, s[0:1] offset:32
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
@@ -73,8 +67,6 @@ define amdgpu_kernel void @global_load_tr_b128_v4bf16(ptr addrspace(1) %addr, pt
 ; GFX12-NEXT:    global_load_tr_b128 v[0:1], v2, s[0:1] offset:32
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll
index d0d759a57a682c..37174dec520209 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll
@@ -29,8 +29,6 @@ define amdgpu_kernel void @v_icmp_i32_eq(ptr addrspace(1) %out, i32 %src) {
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT:    s_nop 0
-; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_icmp_i32_eq:
@@ -56,8 +54,6 @@ define amdgpu_kernel void @v_icmp_i32_eq(ptr addrspace(1) %out, i32 %src) {
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_icmp_i32_eq:
@@ -91,8 +87,6 @@ define amdgpu_kernel void @v_icmp_i32(ptr addrspace(1) %out, i32 %src) {
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GISEL-GFX11-NEXT:    global_store_b32 v0, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_icmp_i32:
@@ -118,8 +112,6 @@ define amdgpu_kernel void @v_icmp_i32_ne(ptr addrspace(1) %out, i32 %src) {
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT:    s_nop 0
-; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_icmp_i32_ne:
@@ -145,8 +137,6 @@ define amdgpu_kernel void @v_icmp_i32_ne(ptr addrspace(1) %out, i32 %src) {
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_icmp_i32_ne:
@@ -176,8 +166,6 @@ define amdgpu_kernel void @v_icmp_i32_ugt(ptr addrspace(1) %out, i32 %src) {
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT:    s_nop 0
-; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_icmp_i32_ugt:
@@ -203,8 +191,6 @@ define amdgpu_kernel void @v_icmp_i32_ugt(ptr addrspace(1) %out, i32 %src) {
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_icmp_i32_ugt:
@@ -234,8 +220,6 @@ define amdgpu_kernel void @v_icmp_i32_uge(ptr addrspace(1) %out, i32 %src) {
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT:    s_nop 0
-; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_icmp_i32_uge:
@@ -261,8 +245,6 @@ define amdgpu_kernel void @v_icmp_i32_uge(ptr addrspace(1) %out, i32 %src) {
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_icmp_i32_uge:
@@ -292,8 +274,6 @@ define amdgpu_kernel void @v_icmp_i32_ult(ptr addrspace(1) %out, i32 %src) {
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT:    s_nop 0
-; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_icmp_i32_ult:
@@ -319,8 +299,6 @@ define amdgpu_kernel void @v_icmp_i32_ult(ptr addrspace(1) %out, i32 %src) {
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_icmp_i32_ult:
@@ -350,8 +328,6 @@ define amdgpu_kernel void @v_icmp_i32_ule(ptr addrspace(1) %out, i32 %src) {
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT:    s_nop 0
-; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_icmp_i32_ule:
@@ -377,8 +353,6 @@ define amdgpu_kernel void @v_icmp_i32_ule(ptr addrspace(1) %out, i32 %src) {
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_icmp_i32_ule:
@@ -408,8 +382,6 @@ define amdgpu_kernel void @v_icmp_i32_sgt(ptr addrspace(1) %out, i32 %src) #1 {
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT:    s_nop 0
-; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_icmp_i32_sgt:
@@ -435,8 +407,6 @@ define amdgpu_kernel void @v_icmp_i32_sgt(ptr addrspace(1) %out, i32 %src) #1 {
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_icmp_i32_sgt:
@@ -466,8 +436,6 @@ define amdgpu_kernel void @v_icmp_i32_sge(ptr addrspace(1) %out, i32 %src) {
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT:    s_nop 0
-; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_icmp_i32_sge:
@@ -493,8 +461,6 @@ define amdgpu_kernel void @v_icmp_i32_sge(ptr addrspace(1) %out, i32 %src) {
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_icmp_i32_sge:
@@ -524,8 +490,6 @@ define amdgpu_kernel void @v_icmp_i32_slt(ptr addrspace(1) %out, i32 %src) {
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT:    s_nop 0
-; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_icmp_i32_slt:
@@ -551,8 +515,6 @@ define amdgpu_kernel void @v_icmp_i32_slt(ptr addrspace(1) %out, i32 %src) {
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_icmp_i32_slt:
@@ -582,8 +544,6 @@ define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) {
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT:    s_nop 0
-; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_icmp_i32_sle:
@@ -609,8 +569,6 @@ define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) {
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_icmp_i32_sle:
@@ -638,8 +596,6 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) {
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT:    s_nop 0
-; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_icmp_i64_eq:
@@ -661,8 +617,6 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) {
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_icmp_i64_eq:
@@ -688,8 +642,6 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) {
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT:    s_nop 0
-; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_icmp_i64_ne:
@@ -711,8 +663,6 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) {
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_icmp_i64_ne:
@@ -738,8 +688,6 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) {
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT:    s_nop 0
-; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_icmp_u64_ugt:
@@ -761,8 +709,6 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) {
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_icmp_u64_ugt:
@@ -788,8 +734,6 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) {
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT:    s_nop 0
-; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_icmp_u64_uge:
@@ -811,8 +755,6 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) {
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_icmp_u64_uge:
@@ -838,8 +780,6 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) {
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT:    s_nop 0
-; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_icmp_u64_ult:
@@ -861,8 +801,6 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) {
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_icmp_u64_ult:
@@ -888,8 +826,6 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) {
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT:    s_nop 0
-; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_icmp_u64_ule:
@@ -911,8 +847,6 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) {
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_icmp_u64_ule:
@@ -938,8 +872,6 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) {
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT:    s_nop 0
-; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_icmp_i64_sgt:
@@ -961,8 +893,6 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) {
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_icmp_i64_sgt:
@@ -988,8 +918,6 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) {
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT:    s_nop 0
-; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_icmp_i64_sge:
@@ -1011,8 +939,6 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) {
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_icmp_i64_sge:
@@ -1038,8 +964,6 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) {
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT:    s_nop 0
-; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_icmp_i64_slt:
@@ -1061,8 +985,6 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) {
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_icmp_i64_slt:
@@ -1088,8 +1010,6 @@ define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) {
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT:    s_nop 0
-; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_icmp_i64_sle:
@@ -1111,8 +1031,6 @@ define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) {
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_icmp_i64_sle:
@@ -1140,8 +1058,6 @@ define amdgpu_kernel void @v_icmp_i16_eq(ptr addrspace(1) %out, i16 %src) {
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT:    s_nop 0
-; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_icmp_i16_eq:
@@ -1167,8 +1083,6 @@ define amdgpu_kernel void @v_icmp_i16_eq(ptr addrspace(1) %out, i16 %src) {
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_icmp_i16_eq:
@@ -1202,8 +1116,6 @@ define amdgpu_kernel void @v_icmp_i16(ptr addrspace(1) %out, i16 %src) {
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GISEL-GFX11-NEXT:    global_store_b32 v0, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_icmp_i16:
@@ -1229,8 +1141,6 @@ define amdgpu_kernel void @v_icmp_i16_ne(ptr addrspace(1) %out, i16 %src) {
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT:    s_nop 0
-; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_icmp_i16_ne:
@@ -1256,8 +1166,6 @@ define amdgpu_kernel void @v_icmp_i16_ne(ptr addrspace(1) %out, i16 %src) {
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_icmp_i16_ne:
@@ -1287,8 +1195,6 @@ define amdgpu_kernel void @v_icmp_i16_ugt(ptr addrspace(1) %out, i16 %src) {
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT:    s_nop 0
-; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_icmp_i16_ugt:
@@ -1314,8 +1220,6 @@ define amdgpu_kernel void @v_icmp_i16_ugt(ptr addrspace(1) %out, i16 %src) {
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_icmp_i16_ugt:
@@ -1345,8 +1249,6 @@ define amdgpu_kernel void @v_icmp_i16_uge(ptr addrspace(1) %out, i16 %src) {
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT:    s_nop 0
-; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_icmp_i16_uge:
@@ -1372,8 +1274,6 @@ define amdgpu_kernel void @v_icmp_i16_uge(ptr addrspace(1) %out, i16 %src) {
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_icmp_i16_uge:
@@ -1403,8 +1303,6 @@ define amdgpu_kernel void @v_icmp_i16_ult(ptr addrspace(1) %out, i16 %src) {
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT:    s_nop 0
-; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_icmp_i16_ult:
@@ -1430,8 +1328,6 @@ define amdgpu_kernel void @v_icmp_i16_ult(ptr addrspace(1) %out, i16 %src) {
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_icmp_i16_ult:
@@ -1461,8 +1357,6 @@ define amdgpu_kernel void @v_icmp_i16_ule(ptr addrspace(1) %out, i16 %src) {
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT:    s_nop 0
-; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_icmp_i16_ule:
@@ -1488,8 +1382,6 @@ define amdgpu_kernel void @v_icmp_i16_ule(ptr addrspace(1) %out, i16 %src) {
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_icmp_i16_ule:
@@ -1519,8 +1411,6 @@ define amdgpu_kernel void @v_icmp_i16_sgt(ptr addrspace(1) %out, i16 %src) #1 {
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT:    s_nop 0
-; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_icmp_i16_sgt:
@@ -1546,8 +1436,6 @@ define amdgpu_kernel void @v_icmp_i16_sgt(ptr addrspace(1) %out, i16 %src) #1 {
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_icmp_i16_sgt:
@@ -1577,8 +1465,6 @@ define amdgpu_kernel void @v_icmp_i16_sge(ptr addrspace(1) %out, i16 %src) {
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT:    s_nop 0
-; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_icmp_i16_sge:
@@ -1604,8 +1490,6 @@ define amdgpu_kernel void @v_icmp_i16_sge(ptr addrspace(1) %out, i16 %src) {
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_icmp_i16_sge:
@@ -1635,8 +1519,6 @@ define amdgpu_kernel void @v_icmp_i16_slt(ptr addrspace(1) %out, i16 %src) {
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT:    s_nop 0
-; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_icmp_i16_slt:
@@ -1662,8 +1544,6 @@ define amdgpu_kernel void @v_icmp_i16_slt(ptr addrspace(1) %out, i16 %src) {
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_icmp_i16_slt:
@@ -1693,8 +1573,6 @@ define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) {
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-GFX11-NEXT:    s_nop 0
-; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-GFX10-LABEL: v_icmp_i16_sle:
@@ -1720,8 +1598,6 @@ define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) {
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: v_icmp_i16_sle:
@@ -1753,8 +1629,6 @@ define amdgpu_kernel void @v_icmp_i1_ne0(ptr addrspace(1) %out, i32 %a, i32 %b)
 ; GFX11-NEXT:    s_and_b32 s2, s2, s3
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_icmp_i1_ne0:
@@ -1790,8 +1664,6 @@ define amdgpu_ps void @test_intr_icmp_i32_invalid_cc(ptr addrspace(1) %out, i32
 ; GISEL-GFX11-LABEL: test_intr_icmp_i32_invalid_cc:
 ; GISEL-GFX11:       ; %bb.0:
 ; GISEL-GFX11-NEXT:    global_store_b32 v[0:1], v0, off
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: test_intr_icmp_i32_invalid_cc:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll
index cad3c54ae54b04..f5f3bc43658e7f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll
@@ -34,8 +34,6 @@ define amdgpu_kernel void @v_icmp_i32_eq(ptr addrspace(1) %out, i32 %src) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-VI-LABEL: v_icmp_i32_eq:
@@ -102,8 +100,6 @@ define amdgpu_kernel void @v_icmp_i32(ptr addrspace(1) %out, i32 %src) {
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GISEL-GFX11-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-VI-LABEL: v_icmp_i32:
@@ -140,8 +136,6 @@ define amdgpu_kernel void @v_icmp_i32_ne(ptr addrspace(1) %out, i32 %src) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-VI-LABEL: v_icmp_i32_ne:
@@ -202,8 +196,6 @@ define amdgpu_kernel void @v_icmp_i32_ugt(ptr addrspace(1) %out, i32 %src) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-VI-LABEL: v_icmp_i32_ugt:
@@ -264,8 +256,6 @@ define amdgpu_kernel void @v_icmp_i32_uge(ptr addrspace(1) %out, i32 %src) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-VI-LABEL: v_icmp_i32_uge:
@@ -326,8 +316,6 @@ define amdgpu_kernel void @v_icmp_i32_ult(ptr addrspace(1) %out, i32 %src) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-VI-LABEL: v_icmp_i32_ult:
@@ -388,8 +376,6 @@ define amdgpu_kernel void @v_icmp_i32_ule(ptr addrspace(1) %out, i32 %src) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-VI-LABEL: v_icmp_i32_ule:
@@ -450,8 +436,6 @@ define amdgpu_kernel void @v_icmp_i32_sgt(ptr addrspace(1) %out, i32 %src) #1 {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-VI-LABEL: v_icmp_i32_sgt:
@@ -512,8 +496,6 @@ define amdgpu_kernel void @v_icmp_i32_sge(ptr addrspace(1) %out, i32 %src) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-VI-LABEL: v_icmp_i32_sge:
@@ -574,8 +556,6 @@ define amdgpu_kernel void @v_icmp_i32_slt(ptr addrspace(1) %out, i32 %src) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-VI-LABEL: v_icmp_i32_slt:
@@ -636,8 +616,6 @@ define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-VI-LABEL: v_icmp_i32_sle:
@@ -696,8 +674,6 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-VI-LABEL: v_icmp_i64_eq:
@@ -756,8 +732,6 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-VI-LABEL: v_icmp_i64_ne:
@@ -816,8 +790,6 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-VI-LABEL: v_icmp_u64_ugt:
@@ -876,8 +848,6 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-VI-LABEL: v_icmp_u64_uge:
@@ -936,8 +906,6 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-VI-LABEL: v_icmp_u64_ult:
@@ -996,8 +964,6 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-VI-LABEL: v_icmp_u64_ule:
@@ -1056,8 +1022,6 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-VI-LABEL: v_icmp_i64_sgt:
@@ -1116,8 +1080,6 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-VI-LABEL: v_icmp_i64_sge:
@@ -1176,8 +1138,6 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-VI-LABEL: v_icmp_i64_slt:
@@ -1236,8 +1196,6 @@ define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-VI-LABEL: v_icmp_i64_sle:
@@ -1298,8 +1256,6 @@ define amdgpu_kernel void @v_icmp_i16_eq(ptr addrspace(1) %out, i16 %src) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-VI-LABEL: v_icmp_i16_eq:
@@ -1366,8 +1322,6 @@ define amdgpu_kernel void @v_icmp_i16(ptr addrspace(1) %out, i16 %src) {
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GISEL-GFX11-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-VI-LABEL: v_icmp_i16:
@@ -1404,8 +1358,6 @@ define amdgpu_kernel void @v_icmp_i16_ne(ptr addrspace(1) %out, i16 %src) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-VI-LABEL: v_icmp_i16_ne:
@@ -1466,8 +1418,6 @@ define amdgpu_kernel void @v_icmp_i16_ugt(ptr addrspace(1) %out, i16 %src) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-VI-LABEL: v_icmp_i16_ugt:
@@ -1528,8 +1478,6 @@ define amdgpu_kernel void @v_icmp_i16_uge(ptr addrspace(1) %out, i16 %src) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-VI-LABEL: v_icmp_i16_uge:
@@ -1590,8 +1538,6 @@ define amdgpu_kernel void @v_icmp_i16_ult(ptr addrspace(1) %out, i16 %src) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-VI-LABEL: v_icmp_i16_ult:
@@ -1652,8 +1598,6 @@ define amdgpu_kernel void @v_icmp_i16_ule(ptr addrspace(1) %out, i16 %src) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-VI-LABEL: v_icmp_i16_ule:
@@ -1714,8 +1658,6 @@ define amdgpu_kernel void @v_icmp_i16_sgt(ptr addrspace(1) %out, i16 %src) #1 {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-VI-LABEL: v_icmp_i16_sgt:
@@ -1776,8 +1718,6 @@ define amdgpu_kernel void @v_icmp_i16_sge(ptr addrspace(1) %out, i16 %src) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-VI-LABEL: v_icmp_i16_sge:
@@ -1838,8 +1778,6 @@ define amdgpu_kernel void @v_icmp_i16_slt(ptr addrspace(1) %out, i16 %src) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-VI-LABEL: v_icmp_i16_slt:
@@ -1900,8 +1838,6 @@ define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; SDAG-VI-LABEL: v_icmp_i16_sle:
@@ -1964,8 +1900,6 @@ define amdgpu_kernel void @v_icmp_i1_ne0(ptr addrspace(1) %out, i32 %a, i32 %b)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; VI-LABEL: v_icmp_i1_ne0:
@@ -2022,8 +1956,6 @@ define amdgpu_ps void @test_intr_icmp_i32_invalid_cc(ptr addrspace(1) %out, i32
 ; GISEL-GFX11-LABEL: test_intr_icmp_i32_invalid_cc:
 ; GISEL-GFX11:       ; %bb.0:
 ; GISEL-GFX11-NEXT:    global_store_b64 v[0:1], v[0:1], off
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-VI-LABEL: test_intr_icmp_i32_invalid_cc:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.dim.ll
index 31ba2f224bbae0..21482ba1adc1e0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.dim.ll
@@ -373,23 +373,14 @@ define amdgpu_ps void @store_1d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x
 ; GFX9-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf unorm a16
 ; GFX9-NEXT:    s_endpgm
 ;
-; GFX10-LABEL: store_1d:
-; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm a16
-; GFX10-NEXT:    s_endpgm
-;
-; GFX11-LABEL: store_1d:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm a16
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT:    s_endpgm
+; GFX10PLUS-LABEL: store_1d:
+; GFX10PLUS:       ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm a16
+; GFX10PLUS-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: store_1d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D a16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   %s = extractelement <2 x i16> %coords, i32 0
@@ -403,23 +394,14 @@ define amdgpu_ps void @store_2d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x
 ; GFX9-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf unorm a16
 ; GFX9-NEXT:    s_endpgm
 ;
-; GFX10-LABEL: store_2d:
-; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm a16
-; GFX10-NEXT:    s_endpgm
-;
-; GFX11-LABEL: store_2d:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm a16
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT:    s_endpgm
+; GFX10PLUS-LABEL: store_2d:
+; GFX10PLUS:       ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm a16
+; GFX10PLUS-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: store_2d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D a16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   %s = extractelement <2 x i16> %coords, i32 0
@@ -434,23 +416,14 @@ define amdgpu_ps void @store_3d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x
 ; GFX9-NEXT:    image_store v[0:3], v[4:5], s[0:7] dmask:0xf unorm a16
 ; GFX9-NEXT:    s_endpgm
 ;
-; GFX10-LABEL: store_3d:
-; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16
-; GFX10-NEXT:    s_endpgm
-;
-; GFX11-LABEL: store_3d:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT:    s_endpgm
+; GFX10PLUS-LABEL: store_3d:
+; GFX10PLUS:       ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT:    image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16
+; GFX10PLUS-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: store_3d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store v[0:3], [v4, v5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D a16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   %s = extractelement <2 x i16> %coords_lo, i32 0
@@ -466,23 +439,14 @@ define amdgpu_ps void @store_cube(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2
 ; GFX9-NEXT:    image_store v[0:3], v[4:5], s[0:7] dmask:0xf unorm a16 da
 ; GFX9-NEXT:    s_endpgm
 ;
-; GFX10-LABEL: store_cube:
-; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm a16
-; GFX10-NEXT:    s_endpgm
-;
-; GFX11-LABEL: store_cube:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm a16
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT:    s_endpgm
+; GFX10PLUS-LABEL: store_cube:
+; GFX10PLUS:       ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT:    image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm a16
+; GFX10PLUS-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: store_cube:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store v[0:3], [v4, v5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE a16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   %s = extractelement <2 x i16> %coords_lo, i32 0
@@ -498,23 +462,14 @@ define amdgpu_ps void @store_1darray(<8 x i32> inreg %rsrc, <4 x float> %vdata,
 ; GFX9-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf unorm a16 da
 ; GFX9-NEXT:    s_endpgm
 ;
-; GFX10-LABEL: store_1darray:
-; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm a16
-; GFX10-NEXT:    s_endpgm
-;
-; GFX11-LABEL: store_1darray:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm a16
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT:    s_endpgm
+; GFX10PLUS-LABEL: store_1darray:
+; GFX10PLUS:       ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm a16
+; GFX10PLUS-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: store_1darray:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY a16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   %s = extractelement <2 x i16> %coords, i32 0
@@ -529,23 +484,14 @@ define amdgpu_ps void @store_2darray(<8 x i32> inreg %rsrc, <4 x float> %vdata,
 ; GFX9-NEXT:    image_store v[0:3], v[4:5], s[0:7] dmask:0xf unorm a16 da
 ; GFX9-NEXT:    s_endpgm
 ;
-; GFX10-LABEL: store_2darray:
-; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm a16
-; GFX10-NEXT:    s_endpgm
-;
-; GFX11-LABEL: store_2darray:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm a16
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT:    s_endpgm
+; GFX10PLUS-LABEL: store_2darray:
+; GFX10PLUS:       ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT:    image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm a16
+; GFX10PLUS-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: store_2darray:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store v[0:3], [v4, v5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY a16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   %s = extractelement <2 x i16> %coords_lo, i32 0
@@ -561,23 +507,14 @@ define amdgpu_ps void @store_2dmsaa(<8 x i32> inreg %rsrc, <4 x float> %vdata, <
 ; GFX9-NEXT:    image_store v[0:3], v[4:5], s[0:7] dmask:0xf unorm a16
 ; GFX9-NEXT:    s_endpgm
 ;
-; GFX10-LABEL: store_2dmsaa:
-; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm a16
-; GFX10-NEXT:    s_endpgm
-;
-; GFX11-LABEL: store_2dmsaa:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm a16
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT:    s_endpgm
+; GFX10PLUS-LABEL: store_2dmsaa:
+; GFX10PLUS:       ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT:    image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm a16
+; GFX10PLUS-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: store_2dmsaa:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store v[0:3], [v4, v5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA a16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   %s = extractelement <2 x i16> %coords_lo, i32 0
@@ -593,23 +530,14 @@ define amdgpu_ps void @store_2darraymsaa(<8 x i32> inreg %rsrc, <4 x float> %vda
 ; GFX9-NEXT:    image_store v[0:3], v[4:5], s[0:7] dmask:0xf unorm a16 da
 ; GFX9-NEXT:    s_endpgm
 ;
-; GFX10-LABEL: store_2darraymsaa:
-; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm a16
-; GFX10-NEXT:    s_endpgm
-;
-; GFX11-LABEL: store_2darraymsaa:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm a16
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT:    s_endpgm
+; GFX10PLUS-LABEL: store_2darraymsaa:
+; GFX10PLUS:       ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT:    image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm a16
+; GFX10PLUS-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: store_2darraymsaa:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store v[0:3], [v4, v5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY a16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   %s = extractelement <2 x i16> %coords_lo, i32 0
@@ -626,23 +554,14 @@ define amdgpu_ps void @store_mip_1d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <
 ; GFX9-NEXT:    image_store_mip v[0:3], v4, s[0:7] dmask:0xf unorm a16
 ; GFX9-NEXT:    s_endpgm
 ;
-; GFX10-LABEL: store_mip_1d:
-; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    image_store_mip v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm a16
-; GFX10-NEXT:    s_endpgm
-;
-; GFX11-LABEL: store_mip_1d:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    image_store_mip v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm a16
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT:    s_endpgm
+; GFX10PLUS-LABEL: store_mip_1d:
+; GFX10PLUS:       ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT:    image_store_mip v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm a16
+; GFX10PLUS-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: store_mip_1d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store_mip v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D a16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   %s = extractelement <2 x i16> %coords, i32 0
@@ -657,23 +576,14 @@ define amdgpu_ps void @store_mip_2d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <
 ; GFX9-NEXT:    image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf unorm a16
 ; GFX9-NEXT:    s_endpgm
 ;
-; GFX10-LABEL: store_mip_2d:
-; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm a16
-; GFX10-NEXT:    s_endpgm
-;
-; GFX11-LABEL: store_mip_2d:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm a16
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT:    s_endpgm
+; GFX10PLUS-LABEL: store_mip_2d:
+; GFX10PLUS:       ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT:    image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm a16
+; GFX10PLUS-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: store_mip_2d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store_mip v[0:3], [v4, v5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D a16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   %s = extractelement <2 x i16> %coords_lo, i32 0
@@ -689,23 +599,14 @@ define amdgpu_ps void @store_mip_3d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <
 ; GFX9-NEXT:    image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf unorm a16
 ; GFX9-NEXT:    s_endpgm
 ;
-; GFX10-LABEL: store_mip_3d:
-; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16
-; GFX10-NEXT:    s_endpgm
-;
-; GFX11-LABEL: store_mip_3d:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT:    s_endpgm
+; GFX10PLUS-LABEL: store_mip_3d:
+; GFX10PLUS:       ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT:    image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16
+; GFX10PLUS-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: store_mip_3d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store_mip v[0:3], [v4, v5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D a16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   %s = extractelement <2 x i16> %coords_lo, i32 0
@@ -722,23 +623,14 @@ define amdgpu_ps void @store_mip_cube(<8 x i32> inreg %rsrc, <4 x float> %vdata,
 ; GFX9-NEXT:    image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf unorm a16 da
 ; GFX9-NEXT:    s_endpgm
 ;
-; GFX10-LABEL: store_mip_cube:
-; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm a16
-; GFX10-NEXT:    s_endpgm
-;
-; GFX11-LABEL: store_mip_cube:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm a16
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT:    s_endpgm
+; GFX10PLUS-LABEL: store_mip_cube:
+; GFX10PLUS:       ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT:    image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm a16
+; GFX10PLUS-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: store_mip_cube:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store_mip v[0:3], [v4, v5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE a16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   %s = extractelement <2 x i16> %coords_lo, i32 0
@@ -755,23 +647,14 @@ define amdgpu_ps void @store_mip_1darray(<8 x i32> inreg %rsrc, <4 x float> %vda
 ; GFX9-NEXT:    image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf unorm a16 da
 ; GFX9-NEXT:    s_endpgm
 ;
-; GFX10-LABEL: store_mip_1darray:
-; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm a16
-; GFX10-NEXT:    s_endpgm
-;
-; GFX11-LABEL: store_mip_1darray:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm a16
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT:    s_endpgm
+; GFX10PLUS-LABEL: store_mip_1darray:
+; GFX10PLUS:       ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT:    image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm a16
+; GFX10PLUS-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: store_mip_1darray:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store_mip v[0:3], [v4, v5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY a16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   %s = extractelement <2 x i16> %coords_lo, i32 0
@@ -787,23 +670,14 @@ define amdgpu_ps void @store_mip_2darray(<8 x i32> inreg %rsrc, <4 x float> %vda
 ; GFX9-NEXT:    image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf unorm a16 da
 ; GFX9-NEXT:    s_endpgm
 ;
-; GFX10-LABEL: store_mip_2darray:
-; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm a16
-; GFX10-NEXT:    s_endpgm
-;
-; GFX11-LABEL: store_mip_2darray:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm a16
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT:    s_endpgm
+; GFX10PLUS-LABEL: store_mip_2darray:
+; GFX10PLUS:       ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT:    image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm a16
+; GFX10PLUS-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: store_mip_2darray:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store_mip v[0:3], [v4, v5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY a16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   %s = extractelement <2 x i16> %coords_lo, i32 0
@@ -1060,23 +934,14 @@ define amdgpu_ps void @store_1d_V1(<8 x i32> inreg %rsrc, float %vdata, <2 x i16
 ; GFX9-NEXT:    image_store v0, v1, s[0:7] dmask:0x2 unorm a16
 ; GFX9-NEXT:    s_endpgm
 ;
-; GFX10-LABEL: store_1d_V1:
-; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    image_store v0, v1, s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_1D unorm a16
-; GFX10-NEXT:    s_endpgm
-;
-; GFX11-LABEL: store_1d_V1:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    image_store v0, v1, s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_1D unorm a16
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT:    s_endpgm
+; GFX10PLUS-LABEL: store_1d_V1:
+; GFX10PLUS:       ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT:    image_store v0, v1, s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_1D unorm a16
+; GFX10PLUS-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: store_1d_V1:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store v0, v1, s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_1D a16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   %s = extractelement <2 x i16> %coords, i32 0
@@ -1090,23 +955,14 @@ define amdgpu_ps void @store_1d_V2(<8 x i32> inreg %rsrc, <2 x float> %vdata, <2
 ; GFX9-NEXT:    image_store v[0:1], v2, s[0:7] dmask:0xc unorm a16
 ; GFX9-NEXT:    s_endpgm
 ;
-; GFX10-LABEL: store_1d_V2:
-; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    image_store v[0:1], v2, s[0:7] dmask:0xc dim:SQ_RSRC_IMG_1D unorm a16
-; GFX10-NEXT:    s_endpgm
-;
-; GFX11-LABEL: store_1d_V2:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    image_store v[0:1], v2, s[0:7] dmask:0xc dim:SQ_RSRC_IMG_1D unorm a16
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT:    s_endpgm
+; GFX10PLUS-LABEL: store_1d_V2:
+; GFX10PLUS:       ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT:    image_store v[0:1], v2, s[0:7] dmask:0xc dim:SQ_RSRC_IMG_1D unorm a16
+; GFX10PLUS-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: store_1d_V2:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store v[0:1], v2, s[0:7] dmask:0xc dim:SQ_RSRC_IMG_1D a16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   %s = extractelement <2 x i16> %coords, i32 0
@@ -1192,23 +1048,14 @@ define amdgpu_ps void @store_1d_glc(<8 x i32> inreg %rsrc, <4 x float> %vdata, <
 ; GFX9-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf unorm glc a16
 ; GFX9-NEXT:    s_endpgm
 ;
-; GFX10-LABEL: store_1d_glc:
-; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc a16
-; GFX10-NEXT:    s_endpgm
-;
-; GFX11-LABEL: store_1d_glc:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc a16
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT:    s_endpgm
+; GFX10PLUS-LABEL: store_1d_glc:
+; GFX10PLUS:       ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc a16
+; GFX10PLUS-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: store_1d_glc:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D th:TH_STORE_NT a16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   %s = extractelement <2 x i16> %coords, i32 0
@@ -1222,23 +1069,14 @@ define amdgpu_ps void @store_1d_slc(<8 x i32> inreg %rsrc, <4 x float> %vdata, <
 ; GFX9-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf unorm slc a16
 ; GFX9-NEXT:    s_endpgm
 ;
-; GFX10-LABEL: store_1d_slc:
-; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm slc a16
-; GFX10-NEXT:    s_endpgm
-;
-; GFX11-LABEL: store_1d_slc:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm slc a16
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT:    s_endpgm
+; GFX10PLUS-LABEL: store_1d_slc:
+; GFX10PLUS:       ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm slc a16
+; GFX10PLUS-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: store_1d_slc:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D th:TH_STORE_HT a16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   %s = extractelement <2 x i16> %coords, i32 0
@@ -1252,23 +1090,14 @@ define amdgpu_ps void @store_1d_glc_slc(<8 x i32> inreg %rsrc, <4 x float> %vdat
 ; GFX9-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf unorm glc slc a16
 ; GFX9-NEXT:    s_endpgm
 ;
-; GFX10-LABEL: store_1d_glc_slc:
-; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc slc a16
-; GFX10-NEXT:    s_endpgm
-;
-; GFX11-LABEL: store_1d_glc_slc:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc slc a16
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT:    s_endpgm
+; GFX10PLUS-LABEL: store_1d_glc_slc:
+; GFX10PLUS:       ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc slc a16
+; GFX10PLUS-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: store_1d_glc_slc:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D th:TH_STORE_RT_WB a16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   %s = extractelement <2 x i16> %coords, i32 0
@@ -1344,3 +1173,6 @@ declare void @llvm.amdgcn.image.store.1d.v2f32.i16(<2 x float>, i32, i16, <8 x i
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readonly }
 attributes #2 = { nounwind readnone }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX10: {{.*}}
+; GFX11: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.encode.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.encode.ll
index 7c8a395b488f3a..dca5e671ab58f9 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.encode.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.encode.ll
@@ -465,15 +465,11 @@ define amdgpu_ps void @store_1d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x
 ; GFX11-LABEL: store_1d:
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm a16 ; encoding: [0x80,0x0f,0x19,0xf0,0x04,0x00,0x00,0x00]
-; GFX11-NEXT:    s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
 ; GFX11-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 ;
 ; GFX12-LABEL: store_1d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; encoding: [0x40,0x80,0xc1,0xd3,0x00,0x00,0x00,0x00,0x04,0x00,0x00,0x00]
-; GFX12-NEXT:    s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
 ; GFX12-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 main_body:
   %s = extractelement <2 x i16> %coords, i32 0
@@ -495,15 +491,11 @@ define amdgpu_ps void @store_2d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x
 ; GFX11-LABEL: store_2d:
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm a16 ; encoding: [0x84,0x0f,0x19,0xf0,0x04,0x00,0x00,0x00]
-; GFX11-NEXT:    s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
 ; GFX11-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 ;
 ; GFX12-LABEL: store_2d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; encoding: [0x41,0x80,0xc1,0xd3,0x00,0x00,0x00,0x00,0x04,0x00,0x00,0x00]
-; GFX12-NEXT:    s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
 ; GFX12-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 main_body:
   %s = extractelement <2 x i16> %coords, i32 0
@@ -526,15 +518,11 @@ define amdgpu_ps void @store_3d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x
 ; GFX11-LABEL: store_3d:
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 ; encoding: [0x88,0x0f,0x19,0xf0,0x04,0x00,0x00,0x00]
-; GFX11-NEXT:    s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
 ; GFX11-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 ;
 ; GFX12-LABEL: store_3d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store v[0:3], [v4, v5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x42,0x80,0xc1,0xd3,0x00,0x00,0x00,0x00,0x04,0x05,0x00,0x00]
-; GFX12-NEXT:    s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
 ; GFX12-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 main_body:
   %s = extractelement <2 x i16> %coords_lo, i32 0
@@ -558,15 +546,11 @@ define amdgpu_ps void @store_cube(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2
 ; GFX11-LABEL: store_cube:
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm a16 ; encoding: [0x8c,0x0f,0x19,0xf0,0x04,0x00,0x00,0x00]
-; GFX11-NEXT:    s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
 ; GFX11-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 ;
 ; GFX12-LABEL: store_cube:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store v[0:3], [v4, v5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE a16 ; encoding: [0x43,0x80,0xc1,0xd3,0x00,0x00,0x00,0x00,0x04,0x05,0x00,0x00]
-; GFX12-NEXT:    s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
 ; GFX12-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 main_body:
   %s = extractelement <2 x i16> %coords_lo, i32 0
@@ -590,15 +574,11 @@ define amdgpu_ps void @store_1darray(<8 x i32> inreg %rsrc, <4 x float> %vdata,
 ; GFX11-LABEL: store_1darray:
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm a16 ; encoding: [0x90,0x0f,0x19,0xf0,0x04,0x00,0x00,0x00]
-; GFX11-NEXT:    s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
 ; GFX11-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 ;
 ; GFX12-LABEL: store_1darray:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY a16 ; encoding: [0x44,0x80,0xc1,0xd3,0x00,0x00,0x00,0x00,0x04,0x00,0x00,0x00]
-; GFX12-NEXT:    s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
 ; GFX12-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 main_body:
   %s = extractelement <2 x i16> %coords, i32 0
@@ -621,15 +601,11 @@ define amdgpu_ps void @store_2darray(<8 x i32> inreg %rsrc, <4 x float> %vdata,
 ; GFX11-LABEL: store_2darray:
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm a16 ; encoding: [0x94,0x0f,0x19,0xf0,0x04,0x00,0x00,0x00]
-; GFX11-NEXT:    s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
 ; GFX11-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 ;
 ; GFX12-LABEL: store_2darray:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store v[0:3], [v4, v5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY a16 ; encoding: [0x45,0x80,0xc1,0xd3,0x00,0x00,0x00,0x00,0x04,0x05,0x00,0x00]
-; GFX12-NEXT:    s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
 ; GFX12-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 main_body:
   %s = extractelement <2 x i16> %coords_lo, i32 0
@@ -653,15 +629,11 @@ define amdgpu_ps void @store_2dmsaa(<8 x i32> inreg %rsrc, <4 x float> %vdata, <
 ; GFX11-LABEL: store_2dmsaa:
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm a16 ; encoding: [0x98,0x0f,0x19,0xf0,0x04,0x00,0x00,0x00]
-; GFX11-NEXT:    s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
 ; GFX11-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 ;
 ; GFX12-LABEL: store_2dmsaa:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store v[0:3], [v4, v5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA a16 ; encoding: [0x46,0x80,0xc1,0xd3,0x00,0x00,0x00,0x00,0x04,0x05,0x00,0x00]
-; GFX12-NEXT:    s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
 ; GFX12-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 main_body:
   %s = extractelement <2 x i16> %coords_lo, i32 0
@@ -685,15 +657,11 @@ define amdgpu_ps void @store_2darraymsaa(<8 x i32> inreg %rsrc, <4 x float> %vda
 ; GFX11-LABEL: store_2darraymsaa:
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm a16 ; encoding: [0x9c,0x0f,0x19,0xf0,0x04,0x00,0x00,0x00]
-; GFX11-NEXT:    s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
 ; GFX11-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 ;
 ; GFX12-LABEL: store_2darraymsaa:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store v[0:3], [v4, v5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY a16 ; encoding: [0x47,0x80,0xc1,0xd3,0x00,0x00,0x00,0x00,0x04,0x05,0x00,0x00]
-; GFX12-NEXT:    s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
 ; GFX12-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 main_body:
   %s = extractelement <2 x i16> %coords_lo, i32 0
@@ -718,15 +686,11 @@ define amdgpu_ps void @store_mip_1d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <
 ; GFX11-LABEL: store_mip_1d:
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    image_store_mip v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm a16 ; encoding: [0x80,0x0f,0x1d,0xf0,0x04,0x00,0x00,0x00]
-; GFX11-NEXT:    s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
 ; GFX11-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 ;
 ; GFX12-LABEL: store_mip_1d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store_mip v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; encoding: [0x40,0xc0,0xc1,0xd3,0x00,0x00,0x00,0x00,0x04,0x00,0x00,0x00]
-; GFX12-NEXT:    s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
 ; GFX12-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 main_body:
   %s = extractelement <2 x i16> %coords, i32 0
@@ -749,15 +713,11 @@ define amdgpu_ps void @store_mip_2d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <
 ; GFX11-LABEL: store_mip_2d:
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm a16 ; encoding: [0x84,0x0f,0x1d,0xf0,0x04,0x00,0x00,0x00]
-; GFX11-NEXT:    s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
 ; GFX11-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 ;
 ; GFX12-LABEL: store_mip_2d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store_mip v[0:3], [v4, v5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; encoding: [0x41,0xc0,0xc1,0xd3,0x00,0x00,0x00,0x00,0x04,0x05,0x00,0x00]
-; GFX12-NEXT:    s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
 ; GFX12-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 main_body:
   %s = extractelement <2 x i16> %coords_lo, i32 0
@@ -781,15 +741,11 @@ define amdgpu_ps void @store_mip_3d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <
 ; GFX11-LABEL: store_mip_3d:
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 ; encoding: [0x88,0x0f,0x1d,0xf0,0x04,0x00,0x00,0x00]
-; GFX11-NEXT:    s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
 ; GFX11-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 ;
 ; GFX12-LABEL: store_mip_3d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store_mip v[0:3], [v4, v5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x42,0xc0,0xc1,0xd3,0x00,0x00,0x00,0x00,0x04,0x05,0x00,0x00]
-; GFX12-NEXT:    s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
 ; GFX12-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 main_body:
   %s = extractelement <2 x i16> %coords_lo, i32 0
@@ -814,15 +770,11 @@ define amdgpu_ps void @store_mip_cube(<8 x i32> inreg %rsrc, <4 x float> %vdata,
 ; GFX11-LABEL: store_mip_cube:
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm a16 ; encoding: [0x8c,0x0f,0x1d,0xf0,0x04,0x00,0x00,0x00]
-; GFX11-NEXT:    s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
 ; GFX11-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 ;
 ; GFX12-LABEL: store_mip_cube:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store_mip v[0:3], [v4, v5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE a16 ; encoding: [0x43,0xc0,0xc1,0xd3,0x00,0x00,0x00,0x00,0x04,0x05,0x00,0x00]
-; GFX12-NEXT:    s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
 ; GFX12-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 main_body:
   %s = extractelement <2 x i16> %coords_lo, i32 0
@@ -847,15 +799,11 @@ define amdgpu_ps void @store_mip_1darray(<8 x i32> inreg %rsrc, <4 x float> %vda
 ; GFX11-LABEL: store_mip_1darray:
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm a16 ; encoding: [0x90,0x0f,0x1d,0xf0,0x04,0x00,0x00,0x00]
-; GFX11-NEXT:    s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
 ; GFX11-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 ;
 ; GFX12-LABEL: store_mip_1darray:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store_mip v[0:3], [v4, v5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY a16 ; encoding: [0x44,0xc0,0xc1,0xd3,0x00,0x00,0x00,0x00,0x04,0x05,0x00,0x00]
-; GFX12-NEXT:    s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
 ; GFX12-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 main_body:
   %s = extractelement <2 x i16> %coords_lo, i32 0
@@ -879,15 +827,11 @@ define amdgpu_ps void @store_mip_2darray(<8 x i32> inreg %rsrc, <4 x float> %vda
 ; GFX11-LABEL: store_mip_2darray:
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm a16 ; encoding: [0x94,0x0f,0x1d,0xf0,0x04,0x00,0x00,0x00]
-; GFX11-NEXT:    s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
 ; GFX11-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 ;
 ; GFX12-LABEL: store_mip_2darray:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store_mip v[0:3], [v4, v5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY a16 ; encoding: [0x45,0xc0,0xc1,0xd3,0x00,0x00,0x00,0x00,0x04,0x05,0x00,0x00]
-; GFX12-NEXT:    s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
 ; GFX12-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 main_body:
   %s = extractelement <2 x i16> %coords_lo, i32 0
@@ -1212,15 +1156,11 @@ define amdgpu_ps void @store_1d_V1(<8 x i32> inreg %rsrc, float %vdata, <2 x i16
 ; GFX11-LABEL: store_1d_V1:
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    image_store v0, v1, s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_1D unorm a16 ; encoding: [0x80,0x02,0x19,0xf0,0x01,0x00,0x00,0x00]
-; GFX11-NEXT:    s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
 ; GFX11-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 ;
 ; GFX12-LABEL: store_1d_V1:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store v0, v1, s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_1D a16 ; encoding: [0x40,0x80,0x81,0xd0,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00]
-; GFX12-NEXT:    s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
 ; GFX12-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 main_body:
   %s = extractelement <2 x i16> %coords, i32 0
@@ -1242,15 +1182,11 @@ define amdgpu_ps void @store_1d_V2(<8 x i32> inreg %rsrc, <2 x float> %vdata, <2
 ; GFX11-LABEL: store_1d_V2:
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    image_store v[0:1], v2, s[0:7] dmask:0xc dim:SQ_RSRC_IMG_1D unorm a16 ; encoding: [0x80,0x0c,0x19,0xf0,0x02,0x00,0x00,0x00]
-; GFX11-NEXT:    s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
 ; GFX11-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 ;
 ; GFX12-LABEL: store_1d_V2:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store v[0:1], v2, s[0:7] dmask:0xc dim:SQ_RSRC_IMG_1D a16 ; encoding: [0x40,0x80,0x01,0xd3,0x00,0x00,0x00,0x00,0x02,0x00,0x00,0x00]
-; GFX12-NEXT:    s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
 ; GFX12-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 main_body:
   %s = extractelement <2 x i16> %coords, i32 0
@@ -1362,15 +1298,11 @@ define amdgpu_ps void @store_1d_glc(<8 x i32> inreg %rsrc, <4 x float> %vdata, <
 ; GFX11-LABEL: store_1d_glc:
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc a16 ; encoding: [0x80,0x4f,0x19,0xf0,0x04,0x00,0x00,0x00]
-; GFX11-NEXT:    s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
 ; GFX11-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 ;
 ; GFX12-LABEL: store_1d_glc:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D th:TH_STORE_NT a16 ; encoding: [0x40,0x80,0xc1,0xd3,0x00,0x00,0x10,0x00,0x04,0x00,0x00,0x00]
-; GFX12-NEXT:    s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
 ; GFX12-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 main_body:
   %s = extractelement <2 x i16> %coords, i32 0
@@ -1392,15 +1324,11 @@ define amdgpu_ps void @store_1d_slc(<8 x i32> inreg %rsrc, <4 x float> %vdata, <
 ; GFX11-LABEL: store_1d_slc:
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm slc a16 ; encoding: [0x80,0x1f,0x19,0xf0,0x04,0x00,0x00,0x00]
-; GFX11-NEXT:    s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
 ; GFX11-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 ;
 ; GFX12-LABEL: store_1d_slc:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D th:TH_STORE_HT a16 ; encoding: [0x40,0x80,0xc1,0xd3,0x00,0x00,0x20,0x00,0x04,0x00,0x00,0x00]
-; GFX12-NEXT:    s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
 ; GFX12-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 main_body:
   %s = extractelement <2 x i16> %coords, i32 0
@@ -1422,15 +1350,11 @@ define amdgpu_ps void @store_1d_glc_slc(<8 x i32> inreg %rsrc, <4 x float> %vdat
 ; GFX11-LABEL: store_1d_glc_slc:
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc slc a16 ; encoding: [0x80,0x5f,0x19,0xf0,0x04,0x00,0x00,0x00]
-; GFX11-NEXT:    s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
 ; GFX11-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 ;
 ; GFX12-LABEL: store_1d_glc_slc:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D th:TH_STORE_RT_WB a16 ; encoding: [0x40,0x80,0xc1,0xd3,0x00,0x00,0x30,0x00,0x04,0x00,0x00,0x00]
-; GFX12-NEXT:    s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
 ; GFX12-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 main_body:
   %s = extractelement <2 x i16> %coords, i32 0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll
index db72e6c6d0c174..2fced3240fe358 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll
@@ -2667,23 +2667,14 @@ define amdgpu_ps void @store_1d(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %
 ; NOPRT-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf unorm
 ; NOPRT-NEXT:    s_endpgm
 ;
-; GFX10-LABEL: store_1d:
-; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm
-; GFX10-NEXT:    s_endpgm
-;
-; GFX11-LABEL: store_1d:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT:    s_endpgm
+; GFX10PLUS-LABEL: store_1d:
+; GFX10PLUS:       ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm
+; GFX10PLUS-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: store_1d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -2711,23 +2702,14 @@ define amdgpu_ps void @store_2d(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %
 ; NOPRT-NEXT:    image_store v[0:3], v[4:5], s[0:7] dmask:0xf unorm
 ; NOPRT-NEXT:    s_endpgm
 ;
-; GFX10-LABEL: store_2d:
-; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
-; GFX10-NEXT:    s_endpgm
-;
-; GFX11-LABEL: store_2d:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT:    s_endpgm
+; GFX10PLUS-LABEL: store_2d:
+; GFX10PLUS:       ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT:    image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
+; GFX10PLUS-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: store_2d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store v[0:3], [v4, v5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
@@ -2755,23 +2737,14 @@ define amdgpu_ps void @store_3d(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %
 ; NOPRT-NEXT:    image_store v[0:3], v[4:6], s[0:7] dmask:0xf unorm
 ; NOPRT-NEXT:    s_endpgm
 ;
-; GFX10-LABEL: store_3d:
-; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    image_store v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm
-; GFX10-NEXT:    s_endpgm
-;
-; GFX11-LABEL: store_3d:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    image_store v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT:    s_endpgm
+; GFX10PLUS-LABEL: store_3d:
+; GFX10PLUS:       ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT:    image_store v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm
+; GFX10PLUS-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: store_3d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store v[0:3], [v4, v5, v6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.image.store.3d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 0, i32 0)
@@ -2799,23 +2772,14 @@ define amdgpu_ps void @store_cube(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32
 ; NOPRT-NEXT:    image_store v[0:3], v[4:6], s[0:7] dmask:0xf unorm da
 ; NOPRT-NEXT:    s_endpgm
 ;
-; GFX10-LABEL: store_cube:
-; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    image_store v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm
-; GFX10-NEXT:    s_endpgm
-;
-; GFX11-LABEL: store_cube:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    image_store v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT:    s_endpgm
+; GFX10PLUS-LABEL: store_cube:
+; GFX10PLUS:       ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT:    image_store v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm
+; GFX10PLUS-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: store_cube:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store v[0:3], [v4, v5, v6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.image.store.cube.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0)
@@ -2843,23 +2807,14 @@ define amdgpu_ps void @store_1darray(<8 x i32> inreg %rsrc, <4 x float> %vdata,
 ; NOPRT-NEXT:    image_store v[0:3], v[4:5], s[0:7] dmask:0xf unorm da
 ; NOPRT-NEXT:    s_endpgm
 ;
-; GFX10-LABEL: store_1darray:
-; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm
-; GFX10-NEXT:    s_endpgm
-;
-; GFX11-LABEL: store_1darray:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT:    s_endpgm
+; GFX10PLUS-LABEL: store_1darray:
+; GFX10PLUS:       ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT:    image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm
+; GFX10PLUS-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: store_1darray:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store v[0:3], [v4, v5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.image.store.1darray.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0)
@@ -2887,23 +2842,14 @@ define amdgpu_ps void @store_2darray(<8 x i32> inreg %rsrc, <4 x float> %vdata,
 ; NOPRT-NEXT:    image_store v[0:3], v[4:6], s[0:7] dmask:0xf unorm da
 ; NOPRT-NEXT:    s_endpgm
 ;
-; GFX10-LABEL: store_2darray:
-; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    image_store v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm
-; GFX10-NEXT:    s_endpgm
-;
-; GFX11-LABEL: store_2darray:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    image_store v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT:    s_endpgm
+; GFX10PLUS-LABEL: store_2darray:
+; GFX10PLUS:       ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT:    image_store v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm
+; GFX10PLUS-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: store_2darray:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store v[0:3], [v4, v5, v6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.image.store.2darray.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0)
@@ -2931,23 +2877,14 @@ define amdgpu_ps void @store_2dmsaa(<8 x i32> inreg %rsrc, <4 x float> %vdata, i
 ; NOPRT-NEXT:    image_store v[0:3], v[4:6], s[0:7] dmask:0xf unorm
 ; NOPRT-NEXT:    s_endpgm
 ;
-; GFX10-LABEL: store_2dmsaa:
-; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    image_store v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm
-; GFX10-NEXT:    s_endpgm
-;
-; GFX11-LABEL: store_2dmsaa:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    image_store v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT:    s_endpgm
+; GFX10PLUS-LABEL: store_2dmsaa:
+; GFX10PLUS:       ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT:    image_store v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm
+; GFX10PLUS-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: store_2dmsaa:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store v[0:3], [v4, v5, v6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.image.store.2dmsaa.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
@@ -2975,23 +2912,14 @@ define amdgpu_ps void @store_2darraymsaa(<8 x i32> inreg %rsrc, <4 x float> %vda
 ; NOPRT-NEXT:    image_store v[0:3], v[4:7], s[0:7] dmask:0xf unorm da
 ; NOPRT-NEXT:    s_endpgm
 ;
-; GFX10-LABEL: store_2darraymsaa:
-; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    image_store v[0:3], v[4:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm
-; GFX10-NEXT:    s_endpgm
-;
-; GFX11-LABEL: store_2darraymsaa:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    image_store v[0:3], v[4:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT:    s_endpgm
+; GFX10PLUS-LABEL: store_2darraymsaa:
+; GFX10PLUS:       ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT:    image_store v[0:3], v[4:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm
+; GFX10PLUS-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: store_2darraymsaa:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store v[0:3], [v4, v5, v6, v7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.image.store.2darraymsaa.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
@@ -3019,23 +2947,14 @@ define amdgpu_ps void @store_mip_1d(<8 x i32> inreg %rsrc, <4 x float> %vdata, i
 ; NOPRT-NEXT:    image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf unorm
 ; NOPRT-NEXT:    s_endpgm
 ;
-; GFX10-LABEL: store_mip_1d:
-; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm
-; GFX10-NEXT:    s_endpgm
-;
-; GFX11-LABEL: store_mip_1d:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT:    s_endpgm
+; GFX10PLUS-LABEL: store_mip_1d:
+; GFX10PLUS:       ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT:    image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm
+; GFX10PLUS-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: store_mip_1d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store_mip v[0:3], [v4, v5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.image.store.mip.1d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@@ -3063,23 +2982,14 @@ define amdgpu_ps void @store_mip_2d(<8 x i32> inreg %rsrc, <4 x float> %vdata, i
 ; NOPRT-NEXT:    image_store_mip v[0:3], v[4:6], s[0:7] dmask:0xf unorm
 ; NOPRT-NEXT:    s_endpgm
 ;
-; GFX10-LABEL: store_mip_2d:
-; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    image_store_mip v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
-; GFX10-NEXT:    s_endpgm
-;
-; GFX11-LABEL: store_mip_2d:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    image_store_mip v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT:    s_endpgm
+; GFX10PLUS-LABEL: store_mip_2d:
+; GFX10PLUS:       ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT:    image_store_mip v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
+; GFX10PLUS-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: store_mip_2d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store_mip v[0:3], [v4, v5, v6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.image.store.mip.2d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@@ -3107,23 +3017,14 @@ define amdgpu_ps void @store_mip_3d(<8 x i32> inreg %rsrc, <4 x float> %vdata, i
 ; NOPRT-NEXT:    image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf unorm
 ; NOPRT-NEXT:    s_endpgm
 ;
-; GFX10-LABEL: store_mip_3d:
-; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm
-; GFX10-NEXT:    s_endpgm
-;
-; GFX11-LABEL: store_mip_3d:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT:    s_endpgm
+; GFX10PLUS-LABEL: store_mip_3d:
+; GFX10PLUS:       ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT:    image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm
+; GFX10PLUS-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: store_mip_3d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store_mip v[0:3], [v4, v5, v6, v7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.image.store.mip.3d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %r, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@@ -3151,23 +3052,14 @@ define amdgpu_ps void @store_mip_cube(<8 x i32> inreg %rsrc, <4 x float> %vdata,
 ; NOPRT-NEXT:    image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf unorm da
 ; NOPRT-NEXT:    s_endpgm
 ;
-; GFX10-LABEL: store_mip_cube:
-; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm
-; GFX10-NEXT:    s_endpgm
-;
-; GFX11-LABEL: store_mip_cube:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT:    s_endpgm
+; GFX10PLUS-LABEL: store_mip_cube:
+; GFX10PLUS:       ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT:    image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm
+; GFX10PLUS-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: store_mip_cube:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store_mip v[0:3], [v4, v5, v6, v7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.image.store.mip.cube.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %slice, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@@ -3195,23 +3087,14 @@ define amdgpu_ps void @store_mip_1darray(<8 x i32> inreg %rsrc, <4 x float> %vda
 ; NOPRT-NEXT:    image_store_mip v[0:3], v[4:6], s[0:7] dmask:0xf unorm da
 ; NOPRT-NEXT:    s_endpgm
 ;
-; GFX10-LABEL: store_mip_1darray:
-; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    image_store_mip v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm
-; GFX10-NEXT:    s_endpgm
-;
-; GFX11-LABEL: store_mip_1darray:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    image_store_mip v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT:    s_endpgm
+; GFX10PLUS-LABEL: store_mip_1darray:
+; GFX10PLUS:       ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT:    image_store_mip v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm
+; GFX10PLUS-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: store_mip_1darray:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store_mip v[0:3], [v4, v5, v6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.image.store.mip.1darray.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %slice, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@@ -3239,23 +3122,14 @@ define amdgpu_ps void @store_mip_2darray(<8 x i32> inreg %rsrc, <4 x float> %vda
 ; NOPRT-NEXT:    image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf unorm da
 ; NOPRT-NEXT:    s_endpgm
 ;
-; GFX10-LABEL: store_mip_2darray:
-; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm
-; GFX10-NEXT:    s_endpgm
-;
-; GFX11-LABEL: store_mip_2darray:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT:    s_endpgm
+; GFX10PLUS-LABEL: store_mip_2darray:
+; GFX10PLUS:       ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT:    image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm
+; GFX10PLUS-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: store_mip_2darray:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store_mip v[0:3], [v4, v5, v6, v7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.image.store.mip.2darray.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %slice, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@@ -3693,23 +3567,14 @@ define amdgpu_ps void @store_1d_V1(<8 x i32> inreg %rsrc, float %vdata, i32 %s)
 ; NOPRT-NEXT:    image_store v0, v1, s[0:7] dmask:0x2 unorm
 ; NOPRT-NEXT:    s_endpgm
 ;
-; GFX10-LABEL: store_1d_V1:
-; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    image_store v0, v1, s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_1D unorm
-; GFX10-NEXT:    s_endpgm
-;
-; GFX11-LABEL: store_1d_V1:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    image_store v0, v1, s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_1D unorm
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT:    s_endpgm
+; GFX10PLUS-LABEL: store_1d_V1:
+; GFX10PLUS:       ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT:    image_store v0, v1, s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_1D unorm
+; GFX10PLUS-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: store_1d_V1:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store v0, v1, s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_1D
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.image.store.1d.f32.i32(float %vdata, i32 2, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -3737,23 +3602,14 @@ define amdgpu_ps void @store_1d_V2(<8 x i32> inreg %rsrc, <2 x float> %vdata, i3
 ; NOPRT-NEXT:    image_store v[0:1], v2, s[0:7] dmask:0xc unorm
 ; NOPRT-NEXT:    s_endpgm
 ;
-; GFX10-LABEL: store_1d_V2:
-; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    image_store v[0:1], v2, s[0:7] dmask:0xc dim:SQ_RSRC_IMG_1D unorm
-; GFX10-NEXT:    s_endpgm
-;
-; GFX11-LABEL: store_1d_V2:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    image_store v[0:1], v2, s[0:7] dmask:0xc dim:SQ_RSRC_IMG_1D unorm
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT:    s_endpgm
+; GFX10PLUS-LABEL: store_1d_V2:
+; GFX10PLUS:       ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT:    image_store v[0:1], v2, s[0:7] dmask:0xc dim:SQ_RSRC_IMG_1D unorm
+; GFX10PLUS-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: store_1d_V2:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store v[0:1], v2, s[0:7] dmask:0xc dim:SQ_RSRC_IMG_1D
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.image.store.1d.v2f32.i32(<2 x float> %vdata, i32 12, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -3904,23 +3760,14 @@ define amdgpu_ps void @store_1d_glc(<8 x i32> inreg %rsrc, <4 x float> %vdata, i
 ; NOPRT-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf unorm glc
 ; NOPRT-NEXT:    s_endpgm
 ;
-; GFX10-LABEL: store_1d_glc:
-; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc
-; GFX10-NEXT:    s_endpgm
-;
-; GFX11-LABEL: store_1d_glc:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT:    s_endpgm
+; GFX10PLUS-LABEL: store_1d_glc:
+; GFX10PLUS:       ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc
+; GFX10PLUS-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: store_1d_glc:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D th:TH_STORE_NT
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 1)
@@ -3948,23 +3795,14 @@ define amdgpu_ps void @store_1d_slc(<8 x i32> inreg %rsrc, <4 x float> %vdata, i
 ; NOPRT-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf unorm slc
 ; NOPRT-NEXT:    s_endpgm
 ;
-; GFX10-LABEL: store_1d_slc:
-; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm slc
-; GFX10-NEXT:    s_endpgm
-;
-; GFX11-LABEL: store_1d_slc:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm slc
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT:    s_endpgm
+; GFX10PLUS-LABEL: store_1d_slc:
+; GFX10PLUS:       ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm slc
+; GFX10PLUS-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: store_1d_slc:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D th:TH_STORE_HT
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 2)
@@ -3992,23 +3830,14 @@ define amdgpu_ps void @store_1d_glc_slc(<8 x i32> inreg %rsrc, <4 x float> %vdat
 ; NOPRT-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf unorm glc slc
 ; NOPRT-NEXT:    s_endpgm
 ;
-; GFX10-LABEL: store_1d_glc_slc:
-; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc slc
-; GFX10-NEXT:    s_endpgm
-;
-; GFX11-LABEL: store_1d_glc_slc:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc slc
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT:    s_endpgm
+; GFX10PLUS-LABEL: store_1d_glc_slc:
+; GFX10PLUS:       ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc slc
+; GFX10PLUS-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: store_1d_glc_slc:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D th:TH_STORE_RT_WB
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 3)
@@ -4202,23 +4031,13 @@ define amdgpu_ps void @image_store_wait(<8 x i32> inreg %arg, <8 x i32> inreg %a
 ; NOPRT-NEXT:    image_store v[0:3], v4, s[16:23] dmask:0xf unorm
 ; NOPRT-NEXT:    s_endpgm
 ;
-; GFX10-LABEL: image_store_wait:
-; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm
-; GFX10-NEXT:    image_load v[0:3], v4, s[8:15] dmask:0xf dim:SQ_RSRC_IMG_1D unorm
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    image_store v[0:3], v4, s[16:23] dmask:0xf dim:SQ_RSRC_IMG_1D unorm
-; GFX10-NEXT:    s_endpgm
-;
-; GFX11-LABEL: image_store_wait:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm
-; GFX11-NEXT:    image_load v[0:3], v4, s[8:15] dmask:0xf dim:SQ_RSRC_IMG_1D unorm
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    image_store v[0:3], v4, s[16:23] dmask:0xf dim:SQ_RSRC_IMG_1D unorm
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT:    s_endpgm
+; GFX10PLUS-LABEL: image_store_wait:
+; GFX10PLUS:       ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm
+; GFX10PLUS-NEXT:    image_load v[0:3], v4, s[8:15] dmask:0xf dim:SQ_RSRC_IMG_1D unorm
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0)
+; GFX10PLUS-NEXT:    image_store v[0:3], v4, s[16:23] dmask:0xf dim:SQ_RSRC_IMG_1D unorm
+; GFX10PLUS-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: image_store_wait:
 ; GFX12:       ; %bb.0: ; %main_body
@@ -4226,8 +4045,6 @@ define amdgpu_ps void @image_store_wait(<8 x i32> inreg %arg, <8 x i32> inreg %a
 ; GFX12-NEXT:    image_load v[0:3], v4, s[8:15] dmask:0xf dim:SQ_RSRC_IMG_1D
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    image_store v[0:3], v4, s[16:23] dmask:0xf dim:SQ_RSRC_IMG_1D
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %arg3, i32 15, i32 %arg4, <8 x i32> %arg, i32 0, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.store.a16.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.store.a16.d16.ll
index 554b961beaf7b3..fe76d9ca1ab5e7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.store.a16.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.store.a16.d16.ll
@@ -18,15 +18,11 @@ define amdgpu_ps void @store_f16_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <2
 ; GFX11-LABEL: store_f16_1d:
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    image_store v[1:2], v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm a16 d16
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: store_f16_1d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store v[1:2], v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D a16 d16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   %x = extractelement <2 x i16> %coords, i32 0
@@ -49,15 +45,11 @@ define amdgpu_ps void @store_v2f16_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords,
 ; GFX11-LABEL: store_v2f16_1d:
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    image_store v[1:2], v0, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm a16 d16
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: store_v2f16_1d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store v[1:2], v0, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D a16 d16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   %x = extractelement <2 x i16> %coords, i32 0
@@ -80,15 +72,11 @@ define amdgpu_ps void @store_v3f16_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords,
 ; GFX11-LABEL: store_v3f16_1d:
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    image_store v[1:2], v0, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D unorm a16 d16
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: store_v3f16_1d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store v[1:2], v0, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D a16 d16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   %x = extractelement <2 x i16> %coords, i32 0
@@ -111,15 +99,11 @@ define amdgpu_ps void @store_v4f16_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords,
 ; GFX11-LABEL: store_v4f16_1d:
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    image_store v[1:2], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm a16 d16
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: store_v4f16_1d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store v[1:2], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D a16 d16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   %x = extractelement <2 x i16> %coords, i32 0
@@ -142,15 +126,11 @@ define amdgpu_ps void @store_f16_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <2
 ; GFX11-LABEL: store_f16_2d:
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    image_store v[1:2], v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D unorm a16 d16
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: store_f16_2d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store v[1:2], v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 d16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   %x = extractelement <2 x i16> %coords, i32 0
@@ -174,15 +154,11 @@ define amdgpu_ps void @store_v2f16_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords,
 ; GFX11-LABEL: store_v2f16_2d:
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    image_store v[1:2], v0, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D unorm a16 d16
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: store_v2f16_2d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store v[1:2], v0, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D a16 d16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   %x = extractelement <2 x i16> %coords, i32 0
@@ -206,15 +182,11 @@ define amdgpu_ps void @store_v3f16_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords,
 ; GFX11-LABEL: store_v3f16_2d:
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    image_store v[1:2], v0, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_2D unorm a16 d16
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: store_v3f16_2d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store v[1:2], v0, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_2D a16 d16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   %x = extractelement <2 x i16> %coords, i32 0
@@ -238,15 +210,11 @@ define amdgpu_ps void @store_v4f16_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords,
 ; GFX11-LABEL: store_v4f16_2d:
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    image_store v[1:2], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm a16 d16
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: store_v4f16_2d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store v[1:2], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D a16 d16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   %x = extractelement <2 x i16> %coords, i32 0
@@ -270,15 +238,11 @@ define amdgpu_ps void @store_f16_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo,
 ; GFX11-LABEL: store_f16_3d:
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    image_store v[2:3], v[0:1], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_3D unorm a16 d16
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: store_f16_3d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store v[2:3], [v0, v1], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_3D a16 d16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   %x = extractelement <2 x i16> %coords_lo, i32 0
@@ -303,15 +267,11 @@ define amdgpu_ps void @store_v2f16_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_l
 ; GFX11-LABEL: store_v2f16_3d:
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    image_store v[2:3], v[0:1], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_3D unorm a16 d16
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: store_v2f16_3d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store v[2:3], [v0, v1], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 d16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   %x = extractelement <2 x i16> %coords_lo, i32 0
@@ -336,15 +296,11 @@ define amdgpu_ps void @store_v3f16_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_l
 ; GFX11-LABEL: store_v3f16_3d:
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    image_store v[2:3], v[0:1], s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_3D unorm a16 d16
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: store_v3f16_3d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store v[2:3], [v0, v1], s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_3D a16 d16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   %x = extractelement <2 x i16> %coords_lo, i32 0
@@ -369,15 +325,11 @@ define amdgpu_ps void @store_v4f16_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_l
 ; GFX11-LABEL: store_v4f16_3d:
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    image_store v[2:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 d16
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: store_v4f16_3d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store v[2:3], [v0, v1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D a16 d16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   %x = extractelement <2 x i16> %coords_lo, i32 0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.store.a16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.store.a16.ll
index 5d4c91ad8a510f..1110892f6ac08c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.store.a16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.store.a16.ll
@@ -18,15 +18,11 @@ define amdgpu_ps void @store_f32_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <4
 ; GFX11-LABEL: store_f32_1d:
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    image_store v[1:4], v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm a16
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: store_f32_1d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store v[1:4], v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D a16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   %x = extractelement <2 x i16> %coords, i32 0
@@ -48,15 +44,11 @@ define amdgpu_ps void @store_v2f32_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords,
 ; GFX11-LABEL: store_v2f32_1d:
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    image_store v[1:4], v0, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm a16
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: store_v2f32_1d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store v[1:4], v0, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D a16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   %x = extractelement <2 x i16> %coords, i32 0
@@ -78,15 +70,11 @@ define amdgpu_ps void @store_v3f32_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords,
 ; GFX11-LABEL: store_v3f32_1d:
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    image_store v[1:4], v0, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D unorm a16
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: store_v3f32_1d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store v[1:4], v0, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D a16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   %x = extractelement <2 x i16> %coords, i32 0
@@ -108,15 +96,11 @@ define amdgpu_ps void @store_v4f32_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords,
 ; GFX11-LABEL: store_v4f32_1d:
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    image_store v[1:4], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm a16
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: store_v4f32_1d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store v[1:4], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D a16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   %x = extractelement <2 x i16> %coords, i32 0
@@ -138,15 +122,11 @@ define amdgpu_ps void @store_f32_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <4
 ; GFX11-LABEL: store_f32_2d:
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    image_store v[1:4], v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D unorm a16
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: store_f32_2d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store v[1:4], v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   %x = extractelement <2 x i16> %coords, i32 0
@@ -169,15 +149,11 @@ define amdgpu_ps void @store_v2f32_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords,
 ; GFX11-LABEL: store_v2f32_2d:
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    image_store v[1:4], v0, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D unorm a16
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: store_v2f32_2d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store v[1:4], v0, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D a16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   %x = extractelement <2 x i16> %coords, i32 0
@@ -200,15 +176,11 @@ define amdgpu_ps void @store_v3f32_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords,
 ; GFX11-LABEL: store_v3f32_2d:
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    image_store v[1:4], v0, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_2D unorm a16
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: store_v3f32_2d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store v[1:4], v0, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_2D a16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   %x = extractelement <2 x i16> %coords, i32 0
@@ -231,15 +203,11 @@ define amdgpu_ps void @store_v4f32_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords,
 ; GFX11-LABEL: store_v4f32_2d:
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    image_store v[1:4], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm a16
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: store_v4f32_2d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store v[1:4], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D a16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   %x = extractelement <2 x i16> %coords, i32 0
@@ -262,15 +230,11 @@ define amdgpu_ps void @store_f32_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo,
 ; GFX11-LABEL: store_f32_3d:
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    image_store v[2:5], v[0:1], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_3D unorm a16
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: store_f32_3d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store v[2:5], [v0, v1], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_3D a16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   %x = extractelement <2 x i16> %coords_lo, i32 0
@@ -294,15 +258,11 @@ define amdgpu_ps void @store_v2f32_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_l
 ; GFX11-LABEL: store_v2f32_3d:
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    image_store v[2:5], v[0:1], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_3D unorm a16
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: store_v2f32_3d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store v[2:5], [v0, v1], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_3D a16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   %x = extractelement <2 x i16> %coords_lo, i32 0
@@ -326,15 +286,11 @@ define amdgpu_ps void @store_v3f32_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_l
 ; GFX11-LABEL: store_v3f32_3d:
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    image_store v[2:5], v[0:1], s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_3D unorm a16
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: store_v3f32_3d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store v[2:5], [v0, v1], s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_3D a16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   %x = extractelement <2 x i16> %coords_lo, i32 0
@@ -358,15 +314,11 @@ define amdgpu_ps void @store_v4f32_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_l
 ; GFX11-LABEL: store_v4f32_3d:
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    image_store v[2:5], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: store_v4f32_3d:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    image_store v[2:5], [v0, v1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D a16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   %x = extractelement <2 x i16> %coords_lo, i32 0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i32.ll
index 3781faa54e7dc6..d08f826e80625f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i32.ll
@@ -17,8 +17,6 @@ define amdgpu_cs void @constant_false_inverse_ballot(ptr addrspace(1) %out) {
 ; GFX11-NEXT:    s_mov_b32 s0, 0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
 ; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %ballot = call i1 @llvm.amdgcn.inverse.ballot(i32 0)
@@ -35,8 +33,6 @@ define amdgpu_cs void @constant_true_inverse_ballot(ptr addrspace(1) %out) {
 ; GFX11-NEXT:    s_mov_b32 s0, -1
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
 ; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %ballot = call i1 @llvm.amdgcn.inverse.ballot(i32 u0xFFFFFFFF)
@@ -51,8 +47,6 @@ define amdgpu_cs void @constant_mask_inverse_ballot(ptr addrspace(1) %out) {
 ; GFX11-NEXT:    s_movk_i32 s0, 0x1000
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
 ; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %ballot = call i1 @llvm.amdgcn.inverse.ballot(i32 u0x00001000)
@@ -69,8 +63,6 @@ define amdgpu_cs void @vgpr_inverse_ballot(i32 %input, ptr addrspace(1) %out) {
 ; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    global_store_b32 v[1:2], v0, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %ballot = call i1 @llvm.amdgcn.inverse.ballot(i32 %input)
@@ -84,8 +76,6 @@ define amdgpu_cs void @sgpr_inverse_ballot(i32 inreg %input, ptr addrspace(1) %o
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
 ; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %ballot = call i1 @llvm.amdgcn.inverse.ballot(i32 %input)
@@ -105,8 +95,6 @@ define amdgpu_cs void @phi_uniform(i32 inreg %s0_1, i32 inreg %s2, ptr addrspace
 ; GFX11-NEXT:  .LBB5_2: ; %endif
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
 ; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %cc = icmp ne i32 %s2, 0
@@ -140,8 +128,6 @@ define amdgpu_cs void @inverse_ballot_branch(i32 inreg %s0_1, i32 inreg %s2, ptr
 ; GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GISEL-NEXT:    v_mov_b32_e32 v2, s0
 ; GISEL-NEXT:    global_store_b32 v[0:1], v2, off
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT:    s_endpgm
 ;
 ; SDAG-LABEL: inverse_ballot_branch:
@@ -155,8 +141,6 @@ define amdgpu_cs void @inverse_ballot_branch(i32 inreg %s0_1, i32 inreg %s2, ptr
 ; SDAG-NEXT:  ; %bb.2: ; %endif
 ; SDAG-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; SDAG-NEXT:    global_store_b32 v[0:1], v2, off
-; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-NEXT:    s_endpgm
 entry:
   %ballot = call i1 @llvm.amdgcn.inverse.ballot(i32 %s2)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll
index 29218a3625216a..2f82ceb37eb90c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll
@@ -18,8 +18,6 @@ define amdgpu_cs void @constant_false_inverse_ballot(ptr addrspace(1) %out) {
 ; GISEL-NEXT:    v_mov_b32_e32 v3, 0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
 ; GISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT:    s_endpgm
 ;
 ; SDAG-LABEL: constant_false_inverse_ballot:
@@ -29,8 +27,6 @@ define amdgpu_cs void @constant_false_inverse_ballot(ptr addrspace(1) %out) {
 ; SDAG-NEXT:    v_mov_b32_e32 v3, s2
 ; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
 ; SDAG-NEXT:    global_store_b64 v[0:1], v[2:3], off
-; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-NEXT:    s_endpgm
 entry:
   %ballot = call i1 @llvm.amdgcn.inverse.ballot.i64(i64 0)
@@ -48,8 +44,6 @@ define amdgpu_cs void @constant_true_inverse_ballot(ptr addrspace(1) %out) {
 ; GISEL-NEXT:    v_mov_b32_e32 v3, 0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
 ; GISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT:    s_endpgm
 ;
 ; SDAG-LABEL: constant_true_inverse_ballot:
@@ -59,8 +53,6 @@ define amdgpu_cs void @constant_true_inverse_ballot(ptr addrspace(1) %out) {
 ; SDAG-NEXT:    v_mov_b32_e32 v3, s2
 ; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
 ; SDAG-NEXT:    global_store_b64 v[0:1], v[2:3], off
-; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-NEXT:    s_endpgm
 entry:
   %ballot = call i1 @llvm.amdgcn.inverse.ballot.i64(i64 u0xFFFFFFFFFFFFFFFF)
@@ -79,8 +71,6 @@ define amdgpu_cs void @constant_mask_inverse_ballot(ptr addrspace(1) %out) {
 ; GISEL-NEXT:    v_mov_b32_e32 v3, 0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
 ; GISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT:    s_endpgm
 ;
 ; SDAG-LABEL: constant_mask_inverse_ballot:
@@ -91,8 +81,6 @@ define amdgpu_cs void @constant_mask_inverse_ballot(ptr addrspace(1) %out) {
 ; SDAG-NEXT:    v_mov_b32_e32 v3, s2
 ; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
 ; SDAG-NEXT:    global_store_b64 v[0:1], v[2:3], off
-; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-NEXT:    s_endpgm
 entry:
   %ballot = call i1 @llvm.amdgcn.inverse.ballot.i64(i64 u0x0040F8010000)
@@ -111,8 +99,6 @@ define amdgpu_cs void @vgpr_inverse_ballot(i64 %input, ptr addrspace(1) %out) {
 ; GISEL-NEXT:    v_mov_b32_e32 v5, 0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[0:1]
 ; GISEL-NEXT:    global_store_b64 v[2:3], v[4:5], off
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT:    s_endpgm
 ;
 ; SDAG-LABEL: vgpr_inverse_ballot:
@@ -123,8 +109,6 @@ define amdgpu_cs void @vgpr_inverse_ballot(i64 %input, ptr addrspace(1) %out) {
 ; SDAG-NEXT:    v_mov_b32_e32 v1, s2
 ; SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
 ; SDAG-NEXT:    global_store_b64 v[2:3], v[0:1], off
-; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-NEXT:    s_endpgm
 entry:
   %ballot = call i1 @llvm.amdgcn.inverse.ballot.i64(i64 %input)
@@ -139,8 +123,6 @@ define amdgpu_cs void @sgpr_inverse_ballot(i64 inreg %input, ptr addrspace(1) %o
 ; GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
 ; GISEL-NEXT:    v_mov_b32_e32 v3, 0
 ; GISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT:    s_endpgm
 ;
 ; SDAG-LABEL: sgpr_inverse_ballot:
@@ -150,8 +132,6 @@ define amdgpu_cs void @sgpr_inverse_ballot(i64 inreg %input, ptr addrspace(1) %o
 ; SDAG-NEXT:    s_waitcnt_depctr 0xfffe
 ; SDAG-NEXT:    v_mov_b32_e32 v3, s0
 ; SDAG-NEXT:    global_store_b64 v[0:1], v[2:3], off
-; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-NEXT:    s_endpgm
 entry:
   %ballot = call i1 @llvm.amdgcn.inverse.ballot.i64(i64 %input)
@@ -173,8 +153,6 @@ define amdgpu_cs void @phi_uniform(i64 inreg %s0_1, i64 inreg %s2, ptr addrspace
 ; GISEL-NEXT:    v_mov_b32_e32 v3, 0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
 ; GISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT:    s_endpgm
 ;
 ; SDAG-LABEL: phi_uniform:
@@ -189,8 +167,6 @@ define amdgpu_cs void @phi_uniform(i64 inreg %s0_1, i64 inreg %s2, ptr addrspace
 ; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
 ; SDAG-NEXT:    v_mov_b32_e32 v3, s2
 ; SDAG-NEXT:    global_store_b64 v[0:1], v[2:3], off
-; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-NEXT:    s_endpgm
 entry:
   %cc = icmp ne i64 %s2, 0
@@ -226,8 +202,6 @@ define amdgpu_cs void @inverse_ballot_branch(i64 inreg %s0_1, i64 inreg %s2, ptr
 ; GISEL-NEXT:    v_mov_b32_e32 v3, s1
 ; GISEL-NEXT:    v_mov_b32_e32 v2, s0
 ; GISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT:    s_endpgm
 ;
 ; SDAG-LABEL: inverse_ballot_branch:
@@ -244,8 +218,6 @@ define amdgpu_cs void @inverse_ballot_branch(i64 inreg %s0_1, i64 inreg %s2, ptr
 ; SDAG-NEXT:  ; %bb.2: ; %endif
 ; SDAG-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; SDAG-NEXT:    global_store_b64 v[0:1], v[2:3], off
-; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-NEXT:    s_endpgm
 entry:
   %ballot = call i1 @llvm.amdgcn.inverse.ballot.i64(i64 %s2)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll
index 42e8b2608dc1c0..80cd97c0c262c0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll
@@ -98,8 +98,6 @@ define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) {
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, s1, v1
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX11-NEXT:    global_store_b32 v[0:1], v0, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds ptr, ptr addrspace(1) %ptr.ptr, i32 %id
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll
index f8e60e5eb09a16..8383621cef2f6b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll
@@ -131,8 +131,6 @@ define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) {
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, s1, v1
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX11-NEXT:    global_store_b32 v[0:1], v0, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds ptr, ptr addrspace(1) %ptr.ptr, i32 %id
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
index c4a86952bc4146..aa6069c67f62ee 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
@@ -34,8 +34,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_i32(ptr addrspace(1) %out, i32 %
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_permlane16_b32 v0, v0, s7, s0
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_permlane16_b32_vss_i32:
@@ -48,8 +46,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_i32(ptr addrspace(1) %out, i32 %
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_permlane16_b32 v0, v0, s7, s0
 ; GFX12-NEXT:    global_store_b32 v1, v0, s[4:5]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
   store i32 %v, ptr addrspace(1) %out
@@ -79,8 +75,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_f32(ptr addrspace(1) %out, float
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_permlane16_b32 v0, v0, s7, s0
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_permlane16_b32_vss_f32:
@@ -93,8 +87,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_f32(ptr addrspace(1) %out, float
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_permlane16_b32 v0, v0, s7, s0
 ; GFX12-NEXT:    global_store_b32 v1, v0, s[4:5]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %v = call float @llvm.amdgcn.permlane16.f32(float %src0, float %src0, i32 %src1, i32 %src2, i1 false, i1 false)
   store float %v, ptr addrspace(1) %out
@@ -142,8 +134,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_i64(ptr addrspace(1) %out, i64 %
 ; GFX11-SDAG-NEXT:    v_permlane16_b32 v1, v1, s0, s1
 ; GFX11-SDAG-NEXT:    v_permlane16_b32 v0, v0, s0, s1
 ; GFX11-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_permlane16_b32_vss_i64:
@@ -158,8 +148,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_i64(ptr addrspace(1) %out, i64 %
 ; GFX11-GISEL-NEXT:    v_permlane16_b32 v0, v0, s0, s1
 ; GFX11-GISEL-NEXT:    v_permlane16_b32 v1, v1, s0, s1
 ; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: v_permlane16_b32_vss_i64:
@@ -174,8 +162,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_i64(ptr addrspace(1) %out, i64 %
 ; GFX12-SDAG-NEXT:    v_permlane16_b32 v1, v1, s0, s1
 ; GFX12-SDAG-NEXT:    v_permlane16_b32 v0, v0, s0, s1
 ; GFX12-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlane16_b32_vss_i64:
@@ -190,8 +176,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_i64(ptr addrspace(1) %out, i64 %
 ; GFX12-GISEL-NEXT:    v_permlane16_b32 v0, v0, s0, s1
 ; GFX12-GISEL-NEXT:    v_permlane16_b32 v1, v1, s0, s1
 ; GFX12-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
   store i64 %v, ptr addrspace(1) %out
@@ -239,8 +223,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_f64(ptr addrspace(1) %out, doubl
 ; GFX11-SDAG-NEXT:    v_permlane16_b32 v1, v1, s0, s1
 ; GFX11-SDAG-NEXT:    v_permlane16_b32 v0, v0, s0, s1
 ; GFX11-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_permlane16_b32_vss_f64:
@@ -255,8 +237,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_f64(ptr addrspace(1) %out, doubl
 ; GFX11-GISEL-NEXT:    v_permlane16_b32 v0, v0, s0, s1
 ; GFX11-GISEL-NEXT:    v_permlane16_b32 v1, v1, s0, s1
 ; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: v_permlane16_b32_vss_f64:
@@ -271,8 +251,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_f64(ptr addrspace(1) %out, doubl
 ; GFX12-SDAG-NEXT:    v_permlane16_b32 v1, v1, s0, s1
 ; GFX12-SDAG-NEXT:    v_permlane16_b32 v0, v0, s0, s1
 ; GFX12-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlane16_b32_vss_f64:
@@ -287,8 +265,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_f64(ptr addrspace(1) %out, doubl
 ; GFX12-GISEL-NEXT:    v_permlane16_b32 v0, v0, s0, s1
 ; GFX12-GISEL-NEXT:    v_permlane16_b32 v1, v1, s0, s1
 ; GFX12-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 %src1, i32 %src2, i1 false, i1 false)
   store double %v, ptr addrspace(1) %out
@@ -318,8 +294,6 @@ define amdgpu_kernel void @v_permlane16_b32_vii_i32(ptr addrspace(1) %out, i32 %
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_permlane16_b32 v0, v0, 1, 2
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_permlane16_b32_vii_i32:
@@ -330,8 +304,6 @@ define amdgpu_kernel void @v_permlane16_b32_vii_i32(ptr addrspace(1) %out, i32 %
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_permlane16_b32 v0, v0, 1, 2
 ; GFX12-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 1, i32 2, i1 false, i1 false)
   store i32 %v, ptr addrspace(1) %out
@@ -361,8 +333,6 @@ define amdgpu_kernel void @v_permlane16_b32_vii_f32(ptr addrspace(1) %out, float
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_permlane16_b32 v0, v0, 1, 2
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_permlane16_b32_vii_f32:
@@ -373,8 +343,6 @@ define amdgpu_kernel void @v_permlane16_b32_vii_f32(ptr addrspace(1) %out, float
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_permlane16_b32 v0, v0, 1, 2
 ; GFX12-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %v = call float @llvm.amdgcn.permlane16.f32(float %src0, float %src0, i32 1, i32 2, i1 false, i1 false)
   store float %v, ptr addrspace(1) %out
@@ -416,8 +384,6 @@ define amdgpu_kernel void @v_permlane16_b32_vii_i64(ptr addrspace(1) %out, i64 %
 ; GFX11-SDAG-NEXT:    v_permlane16_b32 v1, v1, 1, 2
 ; GFX11-SDAG-NEXT:    v_permlane16_b32 v0, v0, 1, 2
 ; GFX11-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_permlane16_b32_vii_i64:
@@ -430,8 +396,6 @@ define amdgpu_kernel void @v_permlane16_b32_vii_i64(ptr addrspace(1) %out, i64 %
 ; GFX11-GISEL-NEXT:    v_permlane16_b32 v0, v0, 1, 2
 ; GFX11-GISEL-NEXT:    v_permlane16_b32 v1, v1, 1, 2
 ; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: v_permlane16_b32_vii_i64:
@@ -444,8 +408,6 @@ define amdgpu_kernel void @v_permlane16_b32_vii_i64(ptr addrspace(1) %out, i64 %
 ; GFX12-SDAG-NEXT:    v_permlane16_b32 v1, v1, 1, 2
 ; GFX12-SDAG-NEXT:    v_permlane16_b32 v0, v0, 1, 2
 ; GFX12-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlane16_b32_vii_i64:
@@ -458,8 +420,6 @@ define amdgpu_kernel void @v_permlane16_b32_vii_i64(ptr addrspace(1) %out, i64 %
 ; GFX12-GISEL-NEXT:    v_permlane16_b32 v0, v0, 1, 2
 ; GFX12-GISEL-NEXT:    v_permlane16_b32 v1, v1, 1, 2
 ; GFX12-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 1, i32 2, i1 false, i1 false)
   store i64 %v, ptr addrspace(1) %out
@@ -501,8 +461,6 @@ define amdgpu_kernel void @v_permlane16_b32_vii_f64(ptr addrspace(1) %out, doubl
 ; GFX11-SDAG-NEXT:    v_permlane16_b32 v1, v1, 1, 2
 ; GFX11-SDAG-NEXT:    v_permlane16_b32 v0, v0, 1, 2
 ; GFX11-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_permlane16_b32_vii_f64:
@@ -515,8 +473,6 @@ define amdgpu_kernel void @v_permlane16_b32_vii_f64(ptr addrspace(1) %out, doubl
 ; GFX11-GISEL-NEXT:    v_permlane16_b32 v0, v0, 1, 2
 ; GFX11-GISEL-NEXT:    v_permlane16_b32 v1, v1, 1, 2
 ; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: v_permlane16_b32_vii_f64:
@@ -529,8 +485,6 @@ define amdgpu_kernel void @v_permlane16_b32_vii_f64(ptr addrspace(1) %out, doubl
 ; GFX12-SDAG-NEXT:    v_permlane16_b32 v1, v1, 1, 2
 ; GFX12-SDAG-NEXT:    v_permlane16_b32 v0, v0, 1, 2
 ; GFX12-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlane16_b32_vii_f64:
@@ -543,8 +497,6 @@ define amdgpu_kernel void @v_permlane16_b32_vii_f64(ptr addrspace(1) %out, doubl
 ; GFX12-GISEL-NEXT:    v_permlane16_b32 v0, v0, 1, 2
 ; GFX12-GISEL-NEXT:    v_permlane16_b32 v1, v1, 1, 2
 ; GFX12-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 1, i32 2, i1 false, i1 false)
   store double %v, ptr addrspace(1) %out
@@ -577,8 +529,6 @@ define amdgpu_kernel void @v_permlane16_b32_vll_i32(ptr addrspace(1) %out, i32 %
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_permlane16_b32 v0, v0, s2, 0xc1d1
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_permlane16_b32_vll_i32:
@@ -591,8 +541,6 @@ define amdgpu_kernel void @v_permlane16_b32_vll_i32(ptr addrspace(1) %out, i32 %
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_permlane16_b32 v0, v0, s2, 0xc1d1
 ; GFX12-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 4660, i32 49617, i1 false, i1 false)
   store i32 %v, ptr addrspace(1) %out
@@ -638,8 +586,6 @@ define amdgpu_kernel void @v_permlane16_b32_vll_i64(ptr addrspace(1) %out, i64 %
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-SDAG-NEXT:    v_permlane16_b32 v0, v0, s2, 0xc1d1
 ; GFX11-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_permlane16_b32_vll_i64:
@@ -654,8 +600,6 @@ define amdgpu_kernel void @v_permlane16_b32_vll_i64(ptr addrspace(1) %out, i64 %
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-GISEL-NEXT:    v_permlane16_b32 v1, v1, s2, 0xc1d1
 ; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: v_permlane16_b32_vll_i64:
@@ -670,8 +614,6 @@ define amdgpu_kernel void @v_permlane16_b32_vll_i64(ptr addrspace(1) %out, i64 %
 ; GFX12-SDAG-NEXT:    v_permlane16_b32 v1, v1, s2, 0xc1d1
 ; GFX12-SDAG-NEXT:    v_permlane16_b32 v0, v0, s2, 0xc1d1
 ; GFX12-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlane16_b32_vll_i64:
@@ -686,8 +628,6 @@ define amdgpu_kernel void @v_permlane16_b32_vll_i64(ptr addrspace(1) %out, i64 %
 ; GFX12-GISEL-NEXT:    v_permlane16_b32 v0, v0, s2, 0xc1d1
 ; GFX12-GISEL-NEXT:    v_permlane16_b32 v1, v1, s2, 0xc1d1
 ; GFX12-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 4660, i32 49617, i1 false, i1 false)
   store i64 %v, ptr addrspace(1) %out
@@ -719,8 +659,6 @@ define amdgpu_kernel void @v_permlane16_b32_vll_f32(ptr addrspace(1) %out,float
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_permlane16_b32 v0, v0, s2, 0xc1d1
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_permlane16_b32_vll_f32:
@@ -733,8 +671,6 @@ define amdgpu_kernel void @v_permlane16_b32_vll_f32(ptr addrspace(1) %out,float
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_permlane16_b32 v0, v0, s2, 0xc1d1
 ; GFX12-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %v = call float @llvm.amdgcn.permlane16.f32(float %src0, float %src0, i32 4660, i32 49617, i1 false, i1 false)
   store float %v, ptr addrspace(1) %out
@@ -780,8 +716,6 @@ define amdgpu_kernel void @v_permlane16_b32_vll_f64(ptr addrspace(1) %out, doubl
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-SDAG-NEXT:    v_permlane16_b32 v0, v0, s2, 0xc1d1
 ; GFX11-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_permlane16_b32_vll_f64:
@@ -796,8 +730,6 @@ define amdgpu_kernel void @v_permlane16_b32_vll_f64(ptr addrspace(1) %out, doubl
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-GISEL-NEXT:    v_permlane16_b32 v1, v1, s2, 0xc1d1
 ; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: v_permlane16_b32_vll_f64:
@@ -812,8 +744,6 @@ define amdgpu_kernel void @v_permlane16_b32_vll_f64(ptr addrspace(1) %out, doubl
 ; GFX12-SDAG-NEXT:    v_permlane16_b32 v1, v1, s2, 0xc1d1
 ; GFX12-SDAG-NEXT:    v_permlane16_b32 v0, v0, s2, 0xc1d1
 ; GFX12-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlane16_b32_vll_f64:
@@ -828,8 +758,6 @@ define amdgpu_kernel void @v_permlane16_b32_vll_f64(ptr addrspace(1) %out, doubl
 ; GFX12-GISEL-NEXT:    v_permlane16_b32 v0, v0, s2, 0xc1d1
 ; GFX12-GISEL-NEXT:    v_permlane16_b32 v1, v1, s2, 0xc1d1
 ; GFX12-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 4660, i32 49617, i1 false, i1 false)
   store double %v, ptr addrspace(1) %out
@@ -868,8 +796,6 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_i32(ptr addrspace(1) %out, i32 %
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SDAG-NEXT:    v_permlane16_b32 v1, v1, s2, s3
 ; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_permlane16_b32_vvv_i32:
@@ -887,8 +813,6 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_i32(ptr addrspace(1) %out, i32 %
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-GISEL-NEXT:    v_permlane16_b32 v0, v0, s2, s3
 ; GFX11-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: v_permlane16_b32_vvv_i32:
@@ -905,8 +829,6 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_i32(ptr addrspace(1) %out, i32 %
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-SDAG-NEXT:    v_permlane16_b32 v1, v1, s3, s2
 ; GFX12-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlane16_b32_vvv_i32:
@@ -922,8 +844,6 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_i32(ptr addrspace(1) %out, i32 %
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12-GISEL-NEXT:    v_permlane16_b32 v0, v0, s3, s4
 ; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %tidx = call i32 @llvm.amdgcn.workitem.id.x()
   %tidy = call i32 @llvm.amdgcn.workitem.id.y()
@@ -977,8 +897,6 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_i64(ptr addrspace(1) %out, i64 %
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_permlane16_b32 v1, v1, s4, s5
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_permlane16_b32_vvv_i64:
@@ -997,8 +915,6 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_i64(ptr addrspace(1) %out, i64 %
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-NEXT:    v_permlane16_b32 v1, v1, s4, s5
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tidx = call i32 @llvm.amdgcn.workitem.id.x()
   %tidy = call i32 @llvm.amdgcn.workitem.id.y()
@@ -1039,8 +955,6 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_f32(ptr addrspace(1) %out, float
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SDAG-NEXT:    v_permlane16_b32 v1, v1, s2, s3
 ; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_permlane16_b32_vvv_f32:
@@ -1058,8 +972,6 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_f32(ptr addrspace(1) %out, float
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-GISEL-NEXT:    v_permlane16_b32 v0, v0, s2, s3
 ; GFX11-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: v_permlane16_b32_vvv_f32:
@@ -1076,8 +988,6 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_f32(ptr addrspace(1) %out, float
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-SDAG-NEXT:    v_permlane16_b32 v1, v1, s3, s2
 ; GFX12-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlane16_b32_vvv_f32:
@@ -1093,8 +1003,6 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_f32(ptr addrspace(1) %out, float
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12-GISEL-NEXT:    v_permlane16_b32 v0, v0, s3, s4
 ; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %tidx = call i32 @llvm.amdgcn.workitem.id.x()
   %tidy = call i32 @llvm.amdgcn.workitem.id.y()
@@ -1148,8 +1056,6 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_f64(ptr addrspace(1) %out, doubl
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_permlane16_b32 v1, v1, s4, s5
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_permlane16_b32_vvv_f64:
@@ -1168,8 +1074,6 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_f64(ptr addrspace(1) %out, doubl
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-NEXT:    v_permlane16_b32 v1, v1, s4, s5
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tidx = call i32 @llvm.amdgcn.workitem.id.x()
   %tidy = call i32 @llvm.amdgcn.workitem.id.y()
@@ -1211,8 +1115,6 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_i32(ptr addrspace(1) %out, i32 %
 ; GFX11-SDAG-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-SDAG-NEXT:    v_permlane16_b32 v1, v1, s2, s3
 ; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_permlane16_b32_vvs_i32:
@@ -1225,8 +1127,6 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_i32(ptr addrspace(1) %out, i32 %
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-GISEL-NEXT:    v_permlane16_b32 v0, v0, s4, s3
 ; GFX11-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: v_permlane16_b32_vvs_i32:
@@ -1239,8 +1139,6 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_i32(ptr addrspace(1) %out, i32 %
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-SDAG-NEXT:    v_permlane16_b32 v1, v1, s2, s3
 ; GFX12-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlane16_b32_vvs_i32:
@@ -1253,8 +1151,6 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_i32(ptr addrspace(1) %out, i32 %
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-GISEL-NEXT:    v_permlane16_b32 v0, v0, s4, s3
 ; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %tidx = call i32 @llvm.amdgcn.workitem.id.x()
   %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %tidx, i32 %src2, i1 false, i1 false)
@@ -1308,8 +1204,6 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_i64(ptr addrspace(1) %out, i64 %
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_permlane16_b32 v0, v0, s1, s0
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_permlane16_b32_vvs_i64:
@@ -1327,8 +1221,6 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_i64(ptr addrspace(1) %out, i64 %
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-NEXT:    v_permlane16_b32 v0, v0, s1, s0
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tidx = call i32 @llvm.amdgcn.workitem.id.x()
   %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 %tidx, i32 %src2, i1 false, i1 false)
@@ -1369,8 +1261,6 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_f32(ptr addrspace(1) %out, float
 ; GFX11-SDAG-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-SDAG-NEXT:    v_permlane16_b32 v1, v1, s2, s3
 ; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_permlane16_b32_vvs_f32:
@@ -1383,8 +1273,6 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_f32(ptr addrspace(1) %out, float
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-GISEL-NEXT:    v_permlane16_b32 v0, v0, s4, s3
 ; GFX11-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: v_permlane16_b32_vvs_f32:
@@ -1397,8 +1285,6 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_f32(ptr addrspace(1) %out, float
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-SDAG-NEXT:    v_permlane16_b32 v1, v1, s2, s3
 ; GFX12-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlane16_b32_vvs_f32:
@@ -1411,8 +1297,6 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_f32(ptr addrspace(1) %out, float
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-GISEL-NEXT:    v_permlane16_b32 v0, v0, s4, s3
 ; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %tidx = call i32 @llvm.amdgcn.workitem.id.x()
   %v = call float @llvm.amdgcn.permlane16.f32(float %src0, float %src0, i32 %tidx, i32 %src2, i1 false, i1 false)
@@ -1466,8 +1350,6 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_f64(ptr addrspace(1) %out, doubl
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_permlane16_b32 v0, v0, s1, s0
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_permlane16_b32_vvs_f64:
@@ -1485,8 +1367,6 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_f64(ptr addrspace(1) %out, doubl
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-NEXT:    v_permlane16_b32 v0, v0, s1, s0
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tidx = call i32 @llvm.amdgcn.workitem.id.x()
   %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 %tidx, i32 %src2, i1 false, i1 false)
@@ -1517,8 +1397,6 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_i32(ptr addrspace(1) %out, i32 %
 ; GFX11-SDAG-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-SDAG-NEXT:    v_permlane16_b32 v1, v1, s3, s2
 ; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_permlane16_b32_vsv_i32:
@@ -1532,8 +1410,6 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_i32(ptr addrspace(1) %out, i32 %
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-GISEL-NEXT:    v_permlane16_b32 v0, v0, s3, s4
 ; GFX11-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: v_permlane16_b32_vsv_i32:
@@ -1547,8 +1423,6 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_i32(ptr addrspace(1) %out, i32 %
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-SDAG-NEXT:    v_permlane16_b32 v1, v1, s3, s2
 ; GFX12-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlane16_b32_vsv_i32:
@@ -1562,8 +1436,6 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_i32(ptr addrspace(1) %out, i32 %
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-GISEL-NEXT:    v_permlane16_b32 v0, v0, s3, s4
 ; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %tidy = call i32 @llvm.amdgcn.workitem.id.y()
   %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %tidy, i1 false, i1 false)
@@ -1617,8 +1489,6 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_i64(ptr addrspace(1) %out, i64 %
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-SDAG-NEXT:    v_permlane16_b32 v0, v0, s0, s1
 ; GFX11-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_permlane16_b32_vsv_i64:
@@ -1636,8 +1506,6 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_i64(ptr addrspace(1) %out, i64 %
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-GISEL-NEXT:    v_permlane16_b32 v1, v1, s0, s1
 ; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: v_permlane16_b32_vsv_i64:
@@ -1655,8 +1523,6 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_i64(ptr addrspace(1) %out, i64 %
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-SDAG-NEXT:    v_permlane16_b32 v0, v0, s0, s1
 ; GFX12-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlane16_b32_vsv_i64:
@@ -1674,8 +1540,6 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_i64(ptr addrspace(1) %out, i64 %
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-GISEL-NEXT:    v_permlane16_b32 v1, v1, s0, s1
 ; GFX12-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %tidy = call i32 @llvm.amdgcn.workitem.id.y()
   %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %tidy, i1 false, i1 false)
@@ -1706,8 +1570,6 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_f32(ptr addrspace(1) %out, float
 ; GFX11-SDAG-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-SDAG-NEXT:    v_permlane16_b32 v1, v1, s3, s2
 ; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_permlane16_b32_vsv_f32:
@@ -1721,8 +1583,6 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_f32(ptr addrspace(1) %out, float
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-GISEL-NEXT:    v_permlane16_b32 v0, v0, s3, s4
 ; GFX11-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: v_permlane16_b32_vsv_f32:
@@ -1736,8 +1596,6 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_f32(ptr addrspace(1) %out, float
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-SDAG-NEXT:    v_permlane16_b32 v1, v1, s3, s2
 ; GFX12-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlane16_b32_vsv_f32:
@@ -1751,8 +1609,6 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_f32(ptr addrspace(1) %out, float
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-GISEL-NEXT:    v_permlane16_b32 v0, v0, s3, s4
 ; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %tidy = call i32 @llvm.amdgcn.workitem.id.y()
   %v = call float @llvm.amdgcn.permlane16.f32(float %src0, float %src0, i32 %src1, i32 %tidy, i1 false, i1 false)
@@ -1806,8 +1662,6 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_f64(ptr addrspace(1) %out, doubl
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-SDAG-NEXT:    v_permlane16_b32 v0, v0, s0, s1
 ; GFX11-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_permlane16_b32_vsv_f64:
@@ -1825,8 +1679,6 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_f64(ptr addrspace(1) %out, doubl
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-GISEL-NEXT:    v_permlane16_b32 v1, v1, s0, s1
 ; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: v_permlane16_b32_vsv_f64:
@@ -1844,8 +1696,6 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_f64(ptr addrspace(1) %out, doubl
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-SDAG-NEXT:    v_permlane16_b32 v0, v0, s0, s1
 ; GFX12-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlane16_b32_vsv_f64:
@@ -1863,8 +1713,6 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_f64(ptr addrspace(1) %out, doubl
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-GISEL-NEXT:    v_permlane16_b32 v1, v1, s0, s1
 ; GFX12-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %tidy = call i32 @llvm.amdgcn.workitem.id.y()
   %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 %src1, i32 %tidy, i1 false, i1 false)
@@ -1895,8 +1743,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_i32(ptr addrspace(1) %out, i3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,0]
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_permlane16_b32_vss_fi_i32:
@@ -1909,8 +1755,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_i32(ptr addrspace(1) %out, i3
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,0]
 ; GFX12-NEXT:    global_store_b32 v1, v0, s[4:5]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 true, i1 false)
   store i32 %v, ptr addrspace(1) %out
@@ -1958,8 +1802,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_i64(ptr addrspace(1) %out, i6
 ; GFX11-SDAG-NEXT:    v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0]
 ; GFX11-SDAG-NEXT:    v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0]
 ; GFX11-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_permlane16_b32_vss_fi_i64:
@@ -1974,8 +1816,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_i64(ptr addrspace(1) %out, i6
 ; GFX11-GISEL-NEXT:    v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0]
 ; GFX11-GISEL-NEXT:    v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0]
 ; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: v_permlane16_b32_vss_fi_i64:
@@ -1990,8 +1830,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_i64(ptr addrspace(1) %out, i6
 ; GFX12-SDAG-NEXT:    v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0]
 ; GFX12-SDAG-NEXT:    v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0]
 ; GFX12-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlane16_b32_vss_fi_i64:
@@ -2006,8 +1844,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_i64(ptr addrspace(1) %out, i6
 ; GFX12-GISEL-NEXT:    v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0]
 ; GFX12-GISEL-NEXT:    v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0]
 ; GFX12-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %src2, i1 true, i1 false)
   store i64 %v, ptr addrspace(1) %out
@@ -2037,8 +1873,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_f32(ptr addrspace(1) %out, fl
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,0]
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_permlane16_b32_vss_fi_f32:
@@ -2051,8 +1885,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_f32(ptr addrspace(1) %out, fl
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,0]
 ; GFX12-NEXT:    global_store_b32 v1, v0, s[4:5]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %v = call float @llvm.amdgcn.permlane16.f32(float %src0, float %src0, i32 %src1, i32 %src2, i1 true, i1 false)
   store float %v, ptr addrspace(1) %out
@@ -2100,8 +1932,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_f64(ptr addrspace(1) %out, do
 ; GFX11-SDAG-NEXT:    v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0]
 ; GFX11-SDAG-NEXT:    v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0]
 ; GFX11-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_permlane16_b32_vss_fi_f64:
@@ -2116,8 +1946,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_f64(ptr addrspace(1) %out, do
 ; GFX11-GISEL-NEXT:    v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0]
 ; GFX11-GISEL-NEXT:    v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0]
 ; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: v_permlane16_b32_vss_fi_f64:
@@ -2132,8 +1960,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_f64(ptr addrspace(1) %out, do
 ; GFX12-SDAG-NEXT:    v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0]
 ; GFX12-SDAG-NEXT:    v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0]
 ; GFX12-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlane16_b32_vss_fi_f64:
@@ -2148,8 +1974,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_f64(ptr addrspace(1) %out, do
 ; GFX12-GISEL-NEXT:    v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0]
 ; GFX12-GISEL-NEXT:    v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0]
 ; GFX12-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 %src1, i32 %src2, i1 true, i1 false)
   store double %v, ptr addrspace(1) %out
@@ -2179,8 +2003,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc_i32(ptr addrspace(1) %out, i3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_permlane16_b32 v0, v0, s7, s0 op_sel:[0,1]
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_permlane16_b32_vss_bc_i32:
@@ -2193,8 +2015,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc_i32(ptr addrspace(1) %out, i3
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_permlane16_b32 v0, v0, s7, s0 op_sel:[0,1]
 ; GFX12-NEXT:    global_store_b32 v1, v0, s[4:5]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 true)
   store i32 %v, ptr addrspace(1) %out
@@ -2242,8 +2062,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc_i64(ptr addrspace(1) %out, i6
 ; GFX11-SDAG-NEXT:    v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1]
 ; GFX11-SDAG-NEXT:    v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1]
 ; GFX11-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_permlane16_b32_vss_bc_i64:
@@ -2258,8 +2076,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc_i64(ptr addrspace(1) %out, i6
 ; GFX11-GISEL-NEXT:    v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1]
 ; GFX11-GISEL-NEXT:    v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1]
 ; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: v_permlane16_b32_vss_bc_i64:
@@ -2274,8 +2090,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc_i64(ptr addrspace(1) %out, i6
 ; GFX12-SDAG-NEXT:    v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1]
 ; GFX12-SDAG-NEXT:    v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1]
 ; GFX12-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlane16_b32_vss_bc_i64:
@@ -2290,8 +2104,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc_i64(ptr addrspace(1) %out, i6
 ; GFX12-GISEL-NEXT:    v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1]
 ; GFX12-GISEL-NEXT:    v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1]
 ; GFX12-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %src2, i1 false, i1 true)
   store i64 %v, ptr addrspace(1) %out
@@ -2321,8 +2133,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc_f32(ptr addrspace(1) %out, fl
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_permlane16_b32 v0, v0, s7, s0 op_sel:[0,1]
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_permlane16_b32_vss_bc_f32:
@@ -2335,8 +2145,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc_f32(ptr addrspace(1) %out, fl
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_permlane16_b32 v0, v0, s7, s0 op_sel:[0,1]
 ; GFX12-NEXT:    global_store_b32 v1, v0, s[4:5]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %v = call float @llvm.amdgcn.permlane16.f32(float %src0, float %src0, i32 %src1, i32 %src2, i1 false, i1 true)
   store float %v, ptr addrspace(1) %out
@@ -2384,8 +2192,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc_f64(ptr addrspace(1) %out, do
 ; GFX11-SDAG-NEXT:    v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1]
 ; GFX11-SDAG-NEXT:    v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1]
 ; GFX11-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_permlane16_b32_vss_bc_f64:
@@ -2400,8 +2206,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc_f64(ptr addrspace(1) %out, do
 ; GFX11-GISEL-NEXT:    v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1]
 ; GFX11-GISEL-NEXT:    v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1]
 ; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: v_permlane16_b32_vss_bc_f64:
@@ -2416,8 +2220,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc_f64(ptr addrspace(1) %out, do
 ; GFX12-SDAG-NEXT:    v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1]
 ; GFX12-SDAG-NEXT:    v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1]
 ; GFX12-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlane16_b32_vss_bc_f64:
@@ -2432,8 +2234,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc_f64(ptr addrspace(1) %out, do
 ; GFX12-GISEL-NEXT:    v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1]
 ; GFX12-GISEL-NEXT:    v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1]
 ; GFX12-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 %src1, i32 %src2, i1 false, i1 true)
   store double %v, ptr addrspace(1) %out
@@ -2463,8 +2263,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_i32(ptr addrspace(1) %out,
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,1]
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_permlane16_b32_vss_fi_bc_i32:
@@ -2477,8 +2275,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_i32(ptr addrspace(1) %out,
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,1]
 ; GFX12-NEXT:    global_store_b32 v1, v0, s[4:5]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 true, i1 true)
   store i32 %v, ptr addrspace(1) %out
@@ -2526,8 +2322,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_i64(ptr addrspace(1) %out,
 ; GFX11-SDAG-NEXT:    v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1]
 ; GFX11-SDAG-NEXT:    v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1]
 ; GFX11-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_permlane16_b32_vss_fi_bc_i64:
@@ -2542,8 +2336,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_i64(ptr addrspace(1) %out,
 ; GFX11-GISEL-NEXT:    v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1]
 ; GFX11-GISEL-NEXT:    v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1]
 ; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: v_permlane16_b32_vss_fi_bc_i64:
@@ -2558,8 +2350,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_i64(ptr addrspace(1) %out,
 ; GFX12-SDAG-NEXT:    v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1]
 ; GFX12-SDAG-NEXT:    v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1]
 ; GFX12-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlane16_b32_vss_fi_bc_i64:
@@ -2574,8 +2364,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_i64(ptr addrspace(1) %out,
 ; GFX12-GISEL-NEXT:    v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1]
 ; GFX12-GISEL-NEXT:    v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1]
 ; GFX12-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %src2, i1 true, i1 true)
   store i64 %v, ptr addrspace(1) %out
@@ -2605,8 +2393,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_f32(ptr addrspace(1) %out,
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,1]
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_permlane16_b32_vss_fi_bc_f32:
@@ -2619,8 +2405,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_f32(ptr addrspace(1) %out,
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,1]
 ; GFX12-NEXT:    global_store_b32 v1, v0, s[4:5]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %v = call float @llvm.amdgcn.permlane16.f32(float %src0, float %src0, i32 %src1, i32 %src2, i1 true, i1 true)
   store float %v, ptr addrspace(1) %out
@@ -2668,8 +2452,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_f64(ptr addrspace(1) %out,
 ; GFX11-SDAG-NEXT:    v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1]
 ; GFX11-SDAG-NEXT:    v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1]
 ; GFX11-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_permlane16_b32_vss_fi_bc_f64:
@@ -2684,8 +2466,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_f64(ptr addrspace(1) %out,
 ; GFX11-GISEL-NEXT:    v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1]
 ; GFX11-GISEL-NEXT:    v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1]
 ; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: v_permlane16_b32_vss_fi_bc_f64:
@@ -2700,8 +2480,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_f64(ptr addrspace(1) %out,
 ; GFX12-SDAG-NEXT:    v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1]
 ; GFX12-SDAG-NEXT:    v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1]
 ; GFX12-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlane16_b32_vss_fi_bc_f64:
@@ -2716,8 +2494,6 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_f64(ptr addrspace(1) %out,
 ; GFX12-GISEL-NEXT:    v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1]
 ; GFX12-GISEL-NEXT:    v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1]
 ; GFX12-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 %src1, i32 %src2, i1 true, i1 true)
   store double %v, ptr addrspace(1) %out
@@ -2747,8 +2523,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_i32(ptr addrspace(1) %out, i32
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_permlanex16_b32 v0, v0, s7, s0
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_permlanex16_b32_vss_i32:
@@ -2761,8 +2535,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_i32(ptr addrspace(1) %out, i32
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_permlanex16_b32 v0, v0, s7, s0
 ; GFX12-NEXT:    global_store_b32 v1, v0, s[4:5]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
   store i32 %v, ptr addrspace(1) %out
@@ -2792,8 +2564,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_f32(ptr addrspace(1) %out, floa
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_permlanex16_b32 v0, v0, s7, s0
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_permlanex16_b32_vss_f32:
@@ -2806,8 +2576,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_f32(ptr addrspace(1) %out, floa
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_permlanex16_b32 v0, v0, s7, s0
 ; GFX12-NEXT:    global_store_b32 v1, v0, s[4:5]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %v = call float @llvm.amdgcn.permlanex16.f32(float %src0, float %src0, i32 %src1, i32 %src2, i1 false, i1 false)
   store float %v, ptr addrspace(1) %out
@@ -2855,8 +2623,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_i64(ptr addrspace(1) %out, i64
 ; GFX11-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s0, s1
 ; GFX11-SDAG-NEXT:    v_permlanex16_b32 v0, v0, s0, s1
 ; GFX11-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_permlanex16_b32_vss_i64:
@@ -2871,8 +2637,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_i64(ptr addrspace(1) %out, i64
 ; GFX11-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s0, s1
 ; GFX11-GISEL-NEXT:    v_permlanex16_b32 v1, v1, s0, s1
 ; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: v_permlanex16_b32_vss_i64:
@@ -2887,8 +2651,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_i64(ptr addrspace(1) %out, i64
 ; GFX12-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s0, s1
 ; GFX12-SDAG-NEXT:    v_permlanex16_b32 v0, v0, s0, s1
 ; GFX12-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlanex16_b32_vss_i64:
@@ -2903,8 +2665,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_i64(ptr addrspace(1) %out, i64
 ; GFX12-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s0, s1
 ; GFX12-GISEL-NEXT:    v_permlanex16_b32 v1, v1, s0, s1
 ; GFX12-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
   store i64 %v, ptr addrspace(1) %out
@@ -2952,8 +2712,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_f64(ptr addrspace(1) %out, doub
 ; GFX11-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s0, s1
 ; GFX11-SDAG-NEXT:    v_permlanex16_b32 v0, v0, s0, s1
 ; GFX11-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_permlanex16_b32_vss_f64:
@@ -2968,8 +2726,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_f64(ptr addrspace(1) %out, doub
 ; GFX11-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s0, s1
 ; GFX11-GISEL-NEXT:    v_permlanex16_b32 v1, v1, s0, s1
 ; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: v_permlanex16_b32_vss_f64:
@@ -2984,8 +2740,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_f64(ptr addrspace(1) %out, doub
 ; GFX12-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s0, s1
 ; GFX12-SDAG-NEXT:    v_permlanex16_b32 v0, v0, s0, s1
 ; GFX12-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlanex16_b32_vss_f64:
@@ -3000,8 +2754,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_f64(ptr addrspace(1) %out, doub
 ; GFX12-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s0, s1
 ; GFX12-GISEL-NEXT:    v_permlanex16_b32 v1, v1, s0, s1
 ; GFX12-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 %src1, i32 %src2, i1 false, i1 false)
   store double %v, ptr addrspace(1) %out
@@ -3031,8 +2783,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_i32(ptr addrspace(1) %out, i32
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_permlanex16_b32 v0, v0, 1, 2
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_permlanex16_b32_vii_i32:
@@ -3043,8 +2793,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_i32(ptr addrspace(1) %out, i32
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_permlanex16_b32 v0, v0, 1, 2
 ; GFX12-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 1, i32 2, i1 false, i1 false)
   store i32 %v, ptr addrspace(1) %out
@@ -3074,8 +2822,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_f32(ptr addrspace(1) %out, floa
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_permlanex16_b32 v0, v0, 1, 2
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_permlanex16_b32_vii_f32:
@@ -3086,8 +2832,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_f32(ptr addrspace(1) %out, floa
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_permlanex16_b32 v0, v0, 1, 2
 ; GFX12-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %v = call float @llvm.amdgcn.permlanex16.f32(float %src0, float %src0, i32 1, i32 2, i1 false, i1 false)
   store float %v, ptr addrspace(1) %out
@@ -3129,8 +2873,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_i64(ptr addrspace(1) %out, i64
 ; GFX11-SDAG-NEXT:    v_permlanex16_b32 v1, v1, 1, 2
 ; GFX11-SDAG-NEXT:    v_permlanex16_b32 v0, v0, 1, 2
 ; GFX11-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_permlanex16_b32_vii_i64:
@@ -3143,8 +2885,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_i64(ptr addrspace(1) %out, i64
 ; GFX11-GISEL-NEXT:    v_permlanex16_b32 v0, v0, 1, 2
 ; GFX11-GISEL-NEXT:    v_permlanex16_b32 v1, v1, 1, 2
 ; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: v_permlanex16_b32_vii_i64:
@@ -3157,8 +2897,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_i64(ptr addrspace(1) %out, i64
 ; GFX12-SDAG-NEXT:    v_permlanex16_b32 v1, v1, 1, 2
 ; GFX12-SDAG-NEXT:    v_permlanex16_b32 v0, v0, 1, 2
 ; GFX12-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlanex16_b32_vii_i64:
@@ -3171,8 +2909,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_i64(ptr addrspace(1) %out, i64
 ; GFX12-GISEL-NEXT:    v_permlanex16_b32 v0, v0, 1, 2
 ; GFX12-GISEL-NEXT:    v_permlanex16_b32 v1, v1, 1, 2
 ; GFX12-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 1, i32 2, i1 false, i1 false)
   store i64 %v, ptr addrspace(1) %out
@@ -3214,8 +2950,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_f64(ptr addrspace(1) %out, doub
 ; GFX11-SDAG-NEXT:    v_permlanex16_b32 v1, v1, 1, 2
 ; GFX11-SDAG-NEXT:    v_permlanex16_b32 v0, v0, 1, 2
 ; GFX11-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_permlanex16_b32_vii_f64:
@@ -3228,8 +2962,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_f64(ptr addrspace(1) %out, doub
 ; GFX11-GISEL-NEXT:    v_permlanex16_b32 v0, v0, 1, 2
 ; GFX11-GISEL-NEXT:    v_permlanex16_b32 v1, v1, 1, 2
 ; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: v_permlanex16_b32_vii_f64:
@@ -3242,8 +2974,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_f64(ptr addrspace(1) %out, doub
 ; GFX12-SDAG-NEXT:    v_permlanex16_b32 v1, v1, 1, 2
 ; GFX12-SDAG-NEXT:    v_permlanex16_b32 v0, v0, 1, 2
 ; GFX12-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlanex16_b32_vii_f64:
@@ -3256,8 +2986,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_f64(ptr addrspace(1) %out, doub
 ; GFX12-GISEL-NEXT:    v_permlanex16_b32 v0, v0, 1, 2
 ; GFX12-GISEL-NEXT:    v_permlanex16_b32 v1, v1, 1, 2
 ; GFX12-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 1, i32 2, i1 false, i1 false)
   store double %v, ptr addrspace(1) %out
@@ -3290,8 +3018,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_i32(ptr addrspace(1) %out, i32
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_permlanex16_b32 v0, v0, s2, 0xc1d1
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_permlanex16_b32_vll_i32:
@@ -3304,8 +3030,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_i32(ptr addrspace(1) %out, i32
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_permlanex16_b32 v0, v0, s2, 0xc1d1
 ; GFX12-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 4660, i32 49617, i1 false, i1 false)
   store i32 %v, ptr addrspace(1) %out
@@ -3337,8 +3061,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_f32(ptr addrspace(1) %out, floa
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_permlanex16_b32 v0, v0, s2, 0xc1d1
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_permlanex16_b32_vll_f32:
@@ -3351,8 +3073,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_f32(ptr addrspace(1) %out, floa
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_permlanex16_b32 v0, v0, s2, 0xc1d1
 ; GFX12-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %v = call float @llvm.amdgcn.permlanex16.f32(float %src0, float %src0, i32 4660, i32 49617, i1 false, i1 false)
   store float %v, ptr addrspace(1) %out
@@ -3398,8 +3118,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_i64(ptr addrspace(1) %out, i64
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-SDAG-NEXT:    v_permlanex16_b32 v0, v0, s2, 0xc1d1
 ; GFX11-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_permlanex16_b32_vll_i64:
@@ -3414,8 +3132,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_i64(ptr addrspace(1) %out, i64
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-GISEL-NEXT:    v_permlanex16_b32 v1, v1, s2, 0xc1d1
 ; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: v_permlanex16_b32_vll_i64:
@@ -3430,8 +3146,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_i64(ptr addrspace(1) %out, i64
 ; GFX12-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s2, 0xc1d1
 ; GFX12-SDAG-NEXT:    v_permlanex16_b32 v0, v0, s2, 0xc1d1
 ; GFX12-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlanex16_b32_vll_i64:
@@ -3446,8 +3160,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_i64(ptr addrspace(1) %out, i64
 ; GFX12-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s2, 0xc1d1
 ; GFX12-GISEL-NEXT:    v_permlanex16_b32 v1, v1, s2, 0xc1d1
 ; GFX12-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 4660, i32 49617, i1 false, i1 false)
   store i64 %v, ptr addrspace(1) %out
@@ -3493,8 +3205,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_f64(ptr addrspace(1) %out, doub
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-SDAG-NEXT:    v_permlanex16_b32 v0, v0, s2, 0xc1d1
 ; GFX11-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_permlanex16_b32_vll_f64:
@@ -3509,8 +3219,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_f64(ptr addrspace(1) %out, doub
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-GISEL-NEXT:    v_permlanex16_b32 v1, v1, s2, 0xc1d1
 ; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: v_permlanex16_b32_vll_f64:
@@ -3525,8 +3233,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_f64(ptr addrspace(1) %out, doub
 ; GFX12-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s2, 0xc1d1
 ; GFX12-SDAG-NEXT:    v_permlanex16_b32 v0, v0, s2, 0xc1d1
 ; GFX12-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlanex16_b32_vll_f64:
@@ -3541,8 +3247,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_f64(ptr addrspace(1) %out, doub
 ; GFX12-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s2, 0xc1d1
 ; GFX12-GISEL-NEXT:    v_permlanex16_b32 v1, v1, s2, 0xc1d1
 ; GFX12-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 4660, i32 49617, i1 false, i1 false)
   store double %v, ptr addrspace(1) %out
@@ -3581,8 +3285,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_i32(ptr addrspace(1) %out, i32
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s2, s3
 ; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_permlanex16_b32_vvv_i32:
@@ -3600,8 +3302,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_i32(ptr addrspace(1) %out, i32
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s2, s3
 ; GFX11-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: v_permlanex16_b32_vvv_i32:
@@ -3618,8 +3318,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_i32(ptr addrspace(1) %out, i32
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s3, s2
 ; GFX12-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlanex16_b32_vvv_i32:
@@ -3635,8 +3333,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_i32(ptr addrspace(1) %out, i32
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s3, s4
 ; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %tidx = call i32 @llvm.amdgcn.workitem.id.x()
   %tidy = call i32 @llvm.amdgcn.workitem.id.y()
@@ -3677,8 +3373,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_f32(ptr addrspace(1) %out, floa
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s2, s3
 ; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_permlanex16_b32_vvv_f32:
@@ -3696,8 +3390,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_f32(ptr addrspace(1) %out, floa
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s2, s3
 ; GFX11-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: v_permlanex16_b32_vvv_f32:
@@ -3714,8 +3406,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_f32(ptr addrspace(1) %out, floa
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s3, s2
 ; GFX12-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlanex16_b32_vvv_f32:
@@ -3731,8 +3421,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_f32(ptr addrspace(1) %out, floa
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s3, s4
 ; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %tidx = call i32 @llvm.amdgcn.workitem.id.x()
   %tidy = call i32 @llvm.amdgcn.workitem.id.y()
@@ -3786,8 +3474,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_i64(ptr addrspace(1) %out, i64
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_permlanex16_b32 v1, v1, s4, s5
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_permlanex16_b32_vvv_i64:
@@ -3806,8 +3492,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_i64(ptr addrspace(1) %out, i64
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-NEXT:    v_permlanex16_b32 v1, v1, s4, s5
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tidx = call i32 @llvm.amdgcn.workitem.id.x()
   %tidy = call i32 @llvm.amdgcn.workitem.id.y()
@@ -3861,8 +3545,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_f64(ptr addrspace(1) %out, doub
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_permlanex16_b32 v1, v1, s4, s5
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_permlanex16_b32_vvv_f64:
@@ -3881,8 +3563,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_f64(ptr addrspace(1) %out, doub
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-NEXT:    v_permlanex16_b32 v1, v1, s4, s5
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tidx = call i32 @llvm.amdgcn.workitem.id.x()
   %tidy = call i32 @llvm.amdgcn.workitem.id.y()
@@ -3924,8 +3604,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_i32(ptr addrspace(1) %out, i32
 ; GFX11-SDAG-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s2, s3
 ; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_permlanex16_b32_vvs_i32:
@@ -3938,8 +3616,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_i32(ptr addrspace(1) %out, i32
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s4, s3
 ; GFX11-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: v_permlanex16_b32_vvs_i32:
@@ -3952,8 +3628,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_i32(ptr addrspace(1) %out, i32
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s2, s3
 ; GFX12-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlanex16_b32_vvs_i32:
@@ -3966,8 +3640,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_i32(ptr addrspace(1) %out, i32
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s4, s3
 ; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %tidx = call i32 @llvm.amdgcn.workitem.id.x()
   %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %tidx, i32 %src2, i1 false, i1 false)
@@ -4008,8 +3680,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_f32(ptr addrspace(1) %out, floa
 ; GFX11-SDAG-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s2, s3
 ; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_permlanex16_b32_vvs_f32:
@@ -4022,8 +3692,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_f32(ptr addrspace(1) %out, floa
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s4, s3
 ; GFX11-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: v_permlanex16_b32_vvs_f32:
@@ -4036,8 +3704,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_f32(ptr addrspace(1) %out, floa
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s2, s3
 ; GFX12-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlanex16_b32_vvs_f32:
@@ -4050,8 +3716,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_f32(ptr addrspace(1) %out, floa
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s4, s3
 ; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %tidx = call i32 @llvm.amdgcn.workitem.id.x()
   %v = call float @llvm.amdgcn.permlanex16.f32(float %src0, float %src0, i32 %tidx, i32 %src2, i1 false, i1 false)
@@ -4105,8 +3769,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_i64(ptr addrspace(1) %out, i64
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_permlanex16_b32 v0, v0, s1, s0
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_permlanex16_b32_vvs_i64:
@@ -4124,8 +3786,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_i64(ptr addrspace(1) %out, i64
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-NEXT:    v_permlanex16_b32 v0, v0, s1, s0
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tidx = call i32 @llvm.amdgcn.workitem.id.x()
   %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 %tidx, i32 %src2, i1 false, i1 false)
@@ -4179,8 +3839,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_f64(ptr addrspace(1) %out, doub
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_permlanex16_b32 v0, v0, s1, s0
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_permlanex16_b32_vvs_f64:
@@ -4198,8 +3856,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_f64(ptr addrspace(1) %out, doub
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-NEXT:    v_permlanex16_b32 v0, v0, s1, s0
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tidx = call i32 @llvm.amdgcn.workitem.id.x()
   %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 %tidx, i32 %src2, i1 false, i1 false)
@@ -4230,8 +3886,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_i32(ptr addrspace(1) %out, i32
 ; GFX11-SDAG-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s3, s2
 ; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_permlanex16_b32_vsv_i32:
@@ -4245,8 +3899,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_i32(ptr addrspace(1) %out, i32
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s3, s4
 ; GFX11-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: v_permlanex16_b32_vsv_i32:
@@ -4260,8 +3912,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_i32(ptr addrspace(1) %out, i32
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s3, s2
 ; GFX12-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlanex16_b32_vsv_i32:
@@ -4275,8 +3925,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_i32(ptr addrspace(1) %out, i32
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s3, s4
 ; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %tidy = call i32 @llvm.amdgcn.workitem.id.y()
   %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %tidy, i1 false, i1 false)
@@ -4307,8 +3955,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_f32(ptr addrspace(1) %out, floa
 ; GFX11-SDAG-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s3, s2
 ; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_permlanex16_b32_vsv_f32:
@@ -4322,8 +3968,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_f32(ptr addrspace(1) %out, floa
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s3, s4
 ; GFX11-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: v_permlanex16_b32_vsv_f32:
@@ -4337,8 +3981,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_f32(ptr addrspace(1) %out, floa
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s3, s2
 ; GFX12-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlanex16_b32_vsv_f32:
@@ -4352,8 +3994,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_f32(ptr addrspace(1) %out, floa
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s3, s4
 ; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %tidy = call i32 @llvm.amdgcn.workitem.id.y()
   %v = call float @llvm.amdgcn.permlanex16.f32(float %src0, float %src0, i32 %src1, i32 %tidy, i1 false, i1 false)
@@ -4407,8 +4047,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_i64(ptr addrspace(1) %out, i64
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-SDAG-NEXT:    v_permlanex16_b32 v0, v0, s0, s1
 ; GFX11-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_permlanex16_b32_vsv_i64:
@@ -4426,8 +4064,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_i64(ptr addrspace(1) %out, i64
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-GISEL-NEXT:    v_permlanex16_b32 v1, v1, s0, s1
 ; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: v_permlanex16_b32_vsv_i64:
@@ -4445,8 +4081,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_i64(ptr addrspace(1) %out, i64
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-SDAG-NEXT:    v_permlanex16_b32 v0, v0, s0, s1
 ; GFX12-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlanex16_b32_vsv_i64:
@@ -4464,8 +4098,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_i64(ptr addrspace(1) %out, i64
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-GISEL-NEXT:    v_permlanex16_b32 v1, v1, s0, s1
 ; GFX12-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %tidy = call i32 @llvm.amdgcn.workitem.id.y()
   %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %tidy, i1 false, i1 false)
@@ -4519,8 +4151,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_f64(ptr addrspace(1) %out, doub
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-SDAG-NEXT:    v_permlanex16_b32 v0, v0, s0, s1
 ; GFX11-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_permlanex16_b32_vsv_f64:
@@ -4538,8 +4168,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_f64(ptr addrspace(1) %out, doub
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-GISEL-NEXT:    v_permlanex16_b32 v1, v1, s0, s1
 ; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: v_permlanex16_b32_vsv_f64:
@@ -4557,8 +4185,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_f64(ptr addrspace(1) %out, doub
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-SDAG-NEXT:    v_permlanex16_b32 v0, v0, s0, s1
 ; GFX12-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlanex16_b32_vsv_f64:
@@ -4576,8 +4202,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_f64(ptr addrspace(1) %out, doub
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-GISEL-NEXT:    v_permlanex16_b32 v1, v1, s0, s1
 ; GFX12-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %tidy = call i32 @llvm.amdgcn.workitem.id.y()
   %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 %src1, i32 %tidy, i1 false, i1 false)
@@ -4608,8 +4232,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_i32(ptr addrspace(1) %out, i
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,0]
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_permlanex16_b32_vss_fi_i32:
@@ -4622,8 +4244,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_i32(ptr addrspace(1) %out, i
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,0]
 ; GFX12-NEXT:    global_store_b32 v1, v0, s[4:5]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 true, i1 false)
   store i32 %v, ptr addrspace(1) %out
@@ -4653,8 +4273,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_f32(ptr addrspace(1) %out, f
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,0]
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_permlanex16_b32_vss_fi_f32:
@@ -4667,8 +4285,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_f32(ptr addrspace(1) %out, f
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,0]
 ; GFX12-NEXT:    global_store_b32 v1, v0, s[4:5]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %v = call float @llvm.amdgcn.permlanex16.f32(float %src0, float %src0, i32 %src1, i32 %src2, i1 true, i1 false)
   store float %v, ptr addrspace(1) %out
@@ -4716,8 +4332,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_i64(ptr addrspace(1) %out, i
 ; GFX11-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0]
 ; GFX11-SDAG-NEXT:    v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0]
 ; GFX11-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_permlanex16_b32_vss_fi_i64:
@@ -4732,8 +4346,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_i64(ptr addrspace(1) %out, i
 ; GFX11-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0]
 ; GFX11-GISEL-NEXT:    v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0]
 ; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: v_permlanex16_b32_vss_fi_i64:
@@ -4748,8 +4360,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_i64(ptr addrspace(1) %out, i
 ; GFX12-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0]
 ; GFX12-SDAG-NEXT:    v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0]
 ; GFX12-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlanex16_b32_vss_fi_i64:
@@ -4764,8 +4374,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_i64(ptr addrspace(1) %out, i
 ; GFX12-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0]
 ; GFX12-GISEL-NEXT:    v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0]
 ; GFX12-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %src2, i1 true, i1 false)
   store i64 %v, ptr addrspace(1) %out
@@ -4813,8 +4421,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_f64(ptr addrspace(1) %out, d
 ; GFX11-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0]
 ; GFX11-SDAG-NEXT:    v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0]
 ; GFX11-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_permlanex16_b32_vss_fi_f64:
@@ -4829,8 +4435,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_f64(ptr addrspace(1) %out, d
 ; GFX11-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0]
 ; GFX11-GISEL-NEXT:    v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0]
 ; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: v_permlanex16_b32_vss_fi_f64:
@@ -4845,8 +4449,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_f64(ptr addrspace(1) %out, d
 ; GFX12-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0]
 ; GFX12-SDAG-NEXT:    v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0]
 ; GFX12-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlanex16_b32_vss_fi_f64:
@@ -4861,8 +4463,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_f64(ptr addrspace(1) %out, d
 ; GFX12-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0]
 ; GFX12-GISEL-NEXT:    v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0]
 ; GFX12-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 %src1, i32 %src2, i1 true, i1 false)
   store double %v, ptr addrspace(1) %out
@@ -4892,8 +4492,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_bc_i32(ptr addrspace(1) %out, i
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_permlanex16_b32 v0, v0, s7, s0 op_sel:[0,1]
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_permlanex16_b32_vss_bc_i32:
@@ -4906,8 +4504,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_bc_i32(ptr addrspace(1) %out, i
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_permlanex16_b32 v0, v0, s7, s0 op_sel:[0,1]
 ; GFX12-NEXT:    global_store_b32 v1, v0, s[4:5]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 true)
   store i32 %v, ptr addrspace(1) %out
@@ -4937,8 +4533,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_bc_f32(ptr addrspace(1) %out, f
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_permlanex16_b32 v0, v0, s7, s0 op_sel:[0,1]
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_permlanex16_b32_vss_bc_f32:
@@ -4951,8 +4545,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_bc_f32(ptr addrspace(1) %out, f
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_permlanex16_b32 v0, v0, s7, s0 op_sel:[0,1]
 ; GFX12-NEXT:    global_store_b32 v1, v0, s[4:5]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %v = call float @llvm.amdgcn.permlanex16.f32(float %src0, float %src0, i32 %src1, i32 %src2, i1 false, i1 true)
   store float %v, ptr addrspace(1) %out
@@ -5000,8 +4592,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_bc_i64(ptr addrspace(1) %out, i
 ; GFX11-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1]
 ; GFX11-SDAG-NEXT:    v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1]
 ; GFX11-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_permlanex16_b32_vss_bc_i64:
@@ -5016,8 +4606,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_bc_i64(ptr addrspace(1) %out, i
 ; GFX11-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1]
 ; GFX11-GISEL-NEXT:    v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1]
 ; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: v_permlanex16_b32_vss_bc_i64:
@@ -5032,8 +4620,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_bc_i64(ptr addrspace(1) %out, i
 ; GFX12-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1]
 ; GFX12-SDAG-NEXT:    v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1]
 ; GFX12-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlanex16_b32_vss_bc_i64:
@@ -5048,8 +4634,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_bc_i64(ptr addrspace(1) %out, i
 ; GFX12-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1]
 ; GFX12-GISEL-NEXT:    v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1]
 ; GFX12-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %src2, i1 false, i1 true)
   store i64 %v, ptr addrspace(1) %out
@@ -5097,8 +4681,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_bc_f64(ptr addrspace(1) %out, d
 ; GFX11-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1]
 ; GFX11-SDAG-NEXT:    v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1]
 ; GFX11-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_permlanex16_b32_vss_bc_f64:
@@ -5113,8 +4695,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_bc_f64(ptr addrspace(1) %out, d
 ; GFX11-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1]
 ; GFX11-GISEL-NEXT:    v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1]
 ; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: v_permlanex16_b32_vss_bc_f64:
@@ -5129,8 +4709,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_bc_f64(ptr addrspace(1) %out, d
 ; GFX12-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1]
 ; GFX12-SDAG-NEXT:    v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1]
 ; GFX12-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlanex16_b32_vss_bc_f64:
@@ -5145,8 +4723,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_bc_f64(ptr addrspace(1) %out, d
 ; GFX12-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1]
 ; GFX12-GISEL-NEXT:    v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1]
 ; GFX12-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 %src1, i32 %src2, i1 false, i1 true)
   store double %v, ptr addrspace(1) %out
@@ -5176,8 +4752,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_i32(ptr addrspace(1) %out
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,1]
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_permlanex16_b32_vss_fi_bc_i32:
@@ -5190,8 +4764,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_i32(ptr addrspace(1) %out
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,1]
 ; GFX12-NEXT:    global_store_b32 v1, v0, s[4:5]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 true, i1 true)
   store i32 %v, ptr addrspace(1) %out
@@ -5221,8 +4793,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_f32(ptr addrspace(1) %out
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,1]
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_permlanex16_b32_vss_fi_bc_f32:
@@ -5235,8 +4805,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_f32(ptr addrspace(1) %out
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,1]
 ; GFX12-NEXT:    global_store_b32 v1, v0, s[4:5]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %v = call float @llvm.amdgcn.permlanex16.f32(float %src0, float %src0, i32 %src1, i32 %src2, i1 true, i1 true)
   store float %v, ptr addrspace(1) %out
@@ -5284,8 +4852,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_i64(ptr addrspace(1) %out
 ; GFX11-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1]
 ; GFX11-SDAG-NEXT:    v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1]
 ; GFX11-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_permlanex16_b32_vss_fi_bc_i64:
@@ -5300,8 +4866,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_i64(ptr addrspace(1) %out
 ; GFX11-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1]
 ; GFX11-GISEL-NEXT:    v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1]
 ; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: v_permlanex16_b32_vss_fi_bc_i64:
@@ -5316,8 +4880,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_i64(ptr addrspace(1) %out
 ; GFX12-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1]
 ; GFX12-SDAG-NEXT:    v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1]
 ; GFX12-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlanex16_b32_vss_fi_bc_i64:
@@ -5332,8 +4894,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_i64(ptr addrspace(1) %out
 ; GFX12-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1]
 ; GFX12-GISEL-NEXT:    v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1]
 ; GFX12-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %src2, i1 true, i1 true)
   store i64 %v, ptr addrspace(1) %out
@@ -5381,8 +4941,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_f64(ptr addrspace(1) %out
 ; GFX11-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1]
 ; GFX11-SDAG-NEXT:    v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1]
 ; GFX11-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_permlanex16_b32_vss_fi_bc_f64:
@@ -5397,8 +4955,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_f64(ptr addrspace(1) %out
 ; GFX11-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1]
 ; GFX11-GISEL-NEXT:    v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1]
 ; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: v_permlanex16_b32_vss_fi_bc_f64:
@@ -5413,8 +4969,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_f64(ptr addrspace(1) %out
 ; GFX12-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1]
 ; GFX12-SDAG-NEXT:    v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1]
 ; GFX12-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlanex16_b32_vss_fi_bc_f64:
@@ -5429,8 +4983,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_f64(ptr addrspace(1) %out
 ; GFX12-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1]
 ; GFX12-GISEL-NEXT:    v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1]
 ; GFX12-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 %src1, i32 %src2, i1 true, i1 true)
   store double %v, ptr addrspace(1) %out
@@ -5459,8 +5011,6 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid_i32(ptr addrspace(1) %out, i
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_permlane16_b32 v0, v0, s0, s1
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_permlane16_b32_tid_tid_i32:
@@ -5473,8 +5023,6 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid_i32(ptr addrspace(1) %out, i
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_permlane16_b32 v0, v0, s0, s1
 ; GFX12-NEXT:    global_store_b32 v1, v0, s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tidx = call i32 @llvm.amdgcn.workitem.id.x()
   %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %tidx, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 false)
@@ -5504,8 +5052,6 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid_f32(ptr addrspace(1) %out, i
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_permlane16_b32 v0, v0, s0, s1
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_permlane16_b32_tid_tid_f32:
@@ -5518,8 +5064,6 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid_f32(ptr addrspace(1) %out, i
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_permlane16_b32 v0, v0, s0, s1
 ; GFX12-NEXT:    global_store_b32 v1, v0, s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tidx = call i32 @llvm.amdgcn.workitem.id.x()
   %tidx_f32 = bitcast i32 %tidx to float
@@ -5567,8 +5111,6 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid_i64(ptr addrspace(1) %out, i
 ; GFX11-SDAG-NEXT:    v_permlane16_b32 v1, v1, s0, s1
 ; GFX11-SDAG-NEXT:    v_permlane16_b32 v0, v0, s0, s1
 ; GFX11-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_permlane16_b32_tid_tid_i64:
@@ -5583,8 +5125,6 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid_i64(ptr addrspace(1) %out, i
 ; GFX11-GISEL-NEXT:    v_permlane16_b32 v0, v0, s0, s1
 ; GFX11-GISEL-NEXT:    v_permlane16_b32 v1, v1, s0, s1
 ; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: v_permlane16_b32_tid_tid_i64:
@@ -5599,8 +5139,6 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid_i64(ptr addrspace(1) %out, i
 ; GFX12-SDAG-NEXT:    v_permlane16_b32 v1, v1, s0, s1
 ; GFX12-SDAG-NEXT:    v_permlane16_b32 v0, v0, s0, s1
 ; GFX12-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlane16_b32_tid_tid_i64:
@@ -5615,8 +5153,6 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid_i64(ptr addrspace(1) %out, i
 ; GFX12-GISEL-NEXT:    v_permlane16_b32 v0, v0, s0, s1
 ; GFX12-GISEL-NEXT:    v_permlane16_b32 v1, v1, s0, s1
 ; GFX12-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %tidx = call i32 @llvm.amdgcn.workitem.id.x()
   %tidx_i64 = zext i32 %tidx to i64
@@ -5665,8 +5201,6 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid_f64(ptr addrspace(1) %out, f
 ; GFX11-SDAG-NEXT:    v_permlane16_b32 v1, v1, s0, s1
 ; GFX11-SDAG-NEXT:    v_permlane16_b32 v0, v0, s0, s1
 ; GFX11-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_permlane16_b32_tid_tid_f64:
@@ -5682,8 +5216,6 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid_f64(ptr addrspace(1) %out, f
 ; GFX11-GISEL-NEXT:    v_permlane16_b32 v0, v0, s0, s1
 ; GFX11-GISEL-NEXT:    v_permlane16_b32 v1, v1, s0, s1
 ; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: v_permlane16_b32_tid_tid_f64:
@@ -5699,8 +5231,6 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid_f64(ptr addrspace(1) %out, f
 ; GFX12-SDAG-NEXT:    v_permlane16_b32 v1, v1, s0, s1
 ; GFX12-SDAG-NEXT:    v_permlane16_b32 v0, v0, s0, s1
 ; GFX12-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlane16_b32_tid_tid_f64:
@@ -5716,8 +5246,6 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid_f64(ptr addrspace(1) %out, f
 ; GFX12-GISEL-NEXT:    v_permlane16_b32 v0, v0, s0, s1
 ; GFX12-GISEL-NEXT:    v_permlane16_b32 v1, v1, s0, s1
 ; GFX12-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %tidx = call i32 @llvm.amdgcn.workitem.id.x()
   %tidx_f32 = bitcast i32 %tidx to float
@@ -5749,8 +5277,6 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid_i32(ptr addrspace(1) %out,
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_permlane16_b32 v0, v0, s0, s1
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_permlane16_b32_undef_tid_i32:
@@ -5763,8 +5289,6 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid_i32(ptr addrspace(1) %out,
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_permlane16_b32 v0, v0, s0, s1
 ; GFX12-NEXT:    global_store_b32 v1, v0, s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tidx = call i32 @llvm.amdgcn.workitem.id.x()
   %undef = freeze i32 poison
@@ -5795,8 +5319,6 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid_f32(ptr addrspace(1) %out,
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_permlane16_b32 v0, v0, s0, s1
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_permlane16_b32_undef_tid_f32:
@@ -5809,8 +5331,6 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid_f32(ptr addrspace(1) %out,
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_permlane16_b32 v0, v0, s0, s1
 ; GFX12-NEXT:    global_store_b32 v1, v0, s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tidx = call i32 @llvm.amdgcn.workitem.id.x()
   %tidx_f32 = bitcast i32 %tidx to float
@@ -5857,8 +5377,6 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid_i64(ptr addrspace(1) %out,
 ; GFX11-SDAG-NEXT:    v_permlane16_b32 v1, v2, s0, s1
 ; GFX11-SDAG-NEXT:    v_permlane16_b32 v0, v0, s0, s1
 ; GFX11-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_permlane16_b32_undef_tid_i64:
@@ -5873,8 +5391,6 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid_i64(ptr addrspace(1) %out,
 ; GFX11-GISEL-NEXT:    v_permlane16_b32 v0, v0, s0, s1
 ; GFX11-GISEL-NEXT:    v_permlane16_b32 v1, v2, s0, s1
 ; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: v_permlane16_b32_undef_tid_i64:
@@ -5889,8 +5405,6 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid_i64(ptr addrspace(1) %out,
 ; GFX12-SDAG-NEXT:    v_permlane16_b32 v1, v2, s0, s1
 ; GFX12-SDAG-NEXT:    v_permlane16_b32 v0, v0, s0, s1
 ; GFX12-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlane16_b32_undef_tid_i64:
@@ -5905,8 +5419,6 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid_i64(ptr addrspace(1) %out,
 ; GFX12-GISEL-NEXT:    v_permlane16_b32 v0, v0, s0, s1
 ; GFX12-GISEL-NEXT:    v_permlane16_b32 v1, v2, s0, s1
 ; GFX12-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %tidx = call i32 @llvm.amdgcn.workitem.id.x()
   %tidx_i64 = zext i32 %tidx to i64
@@ -5956,8 +5468,6 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid_f64(ptr addrspace(1) %out,
 ; GFX11-SDAG-NEXT:    v_permlane16_b32 v1, v1, s0, s1
 ; GFX11-SDAG-NEXT:    v_permlane16_b32 v0, v0, s0, s1
 ; GFX11-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_permlane16_b32_undef_tid_f64:
@@ -5973,8 +5483,6 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid_f64(ptr addrspace(1) %out,
 ; GFX11-GISEL-NEXT:    v_permlane16_b32 v0, v0, s0, s1
 ; GFX11-GISEL-NEXT:    v_permlane16_b32 v1, v1, s0, s1
 ; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: v_permlane16_b32_undef_tid_f64:
@@ -5990,8 +5498,6 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid_f64(ptr addrspace(1) %out,
 ; GFX12-SDAG-NEXT:    v_permlane16_b32 v1, v1, s0, s1
 ; GFX12-SDAG-NEXT:    v_permlane16_b32 v0, v0, s0, s1
 ; GFX12-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlane16_b32_undef_tid_f64:
@@ -6007,8 +5513,6 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid_f64(ptr addrspace(1) %out,
 ; GFX12-GISEL-NEXT:    v_permlane16_b32 v0, v0, s0, s1
 ; GFX12-GISEL-NEXT:    v_permlane16_b32 v1, v1, s0, s1
 ; GFX12-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %tidx = call i32 @llvm.amdgcn.workitem.id.x()
   %tidx_f32 = bitcast i32 %tidx to float
@@ -6055,8 +5559,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_i32(ptr addrspace(1) %out, i32
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SDAG-NEXT:    v_permlane16_b32 v1, v0, s0, s1
 ; GFX11-SDAG-NEXT:    global_store_b32 v2, v1, s[2:3]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_i32:
@@ -6071,8 +5573,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_i32(ptr addrspace(1) %out, i32
 ; GFX11-GISEL-NEXT:    v_permlane16_b32 v1, v0, s0, s1
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-GISEL-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_i32:
@@ -6086,8 +5586,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_i32(ptr addrspace(1) %out, i32
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlane16_b32 v1, v0, s0, s1
 ; GFX12-SDAG-NEXT:    global_store_b32 v2, v1, s[2:3]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_i32:
@@ -6102,8 +5600,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_i32(ptr addrspace(1) %out, i32
 ; GFX12-GISEL-NEXT:    v_permlane16_b32 v1, v0, s0, s1
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-GISEL-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %tidx = call i32 @llvm.amdgcn.workitem.id.x()
   %v = call i32 @llvm.amdgcn.permlane16.i32(i32 12345, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 false)
@@ -6147,8 +5643,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_f32(ptr addrspace(1) %out, i32
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SDAG-NEXT:    v_permlane16_b32 v1, v0, s0, s1
 ; GFX11-SDAG-NEXT:    global_store_b32 v2, v1, s[2:3]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_f32:
@@ -6163,8 +5657,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_f32(ptr addrspace(1) %out, i32
 ; GFX11-GISEL-NEXT:    v_permlane16_b32 v1, v0, s0, s1
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-GISEL-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_f32:
@@ -6178,8 +5670,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_f32(ptr addrspace(1) %out, i32
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlane16_b32 v1, v0, s0, s1
 ; GFX12-SDAG-NEXT:    global_store_b32 v2, v1, s[2:3]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_f32:
@@ -6194,8 +5684,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_f32(ptr addrspace(1) %out, i32
 ; GFX12-GISEL-NEXT:    v_permlane16_b32 v1, v0, s0, s1
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-GISEL-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %tidx = call i32 @llvm.amdgcn.workitem.id.x()
   %tidx_f32 = bitcast i32 %tidx to float
@@ -6245,8 +5733,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_i64(ptr addrspace(1) %out, i32
 ; GFX11-SDAG-NEXT:    v_permlane16_b32 v2, v2, s0, s1
 ; GFX11-SDAG-NEXT:    v_permlane16_b32 v1, v0, s0, s1
 ; GFX11-SDAG-NEXT:    global_store_b64 v3, v[1:2], s[2:3]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_i64:
@@ -6261,8 +5747,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_i64(ptr addrspace(1) %out, i32
 ; GFX11-GISEL-NEXT:    v_permlane16_b32 v1, v0, s0, s1
 ; GFX11-GISEL-NEXT:    v_permlane16_b32 v2, v2, s0, s1
 ; GFX11-GISEL-NEXT:    global_store_b64 v3, v[1:2], s[2:3]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_i64:
@@ -6277,8 +5761,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_i64(ptr addrspace(1) %out, i32
 ; GFX12-SDAG-NEXT:    v_permlane16_b32 v2, v2, s0, s1
 ; GFX12-SDAG-NEXT:    v_permlane16_b32 v1, v0, s0, s1
 ; GFX12-SDAG-NEXT:    global_store_b64 v3, v[1:2], s[2:3]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_i64:
@@ -6293,8 +5775,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_i64(ptr addrspace(1) %out, i32
 ; GFX12-GISEL-NEXT:    v_permlane16_b32 v1, v0, s0, s1
 ; GFX12-GISEL-NEXT:    v_permlane16_b32 v2, v2, s0, s1
 ; GFX12-GISEL-NEXT:    global_store_b64 v3, v[1:2], s[2:3]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %tidx = call i32 @llvm.amdgcn.workitem.id.x()
   %tidx_i64 = zext i32 %tidx to i64
@@ -6348,8 +5828,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_f64(ptr addrspace(1) %out, i32
 ; GFX11-SDAG-NEXT:    v_permlane16_b32 v3, v1, s0, s1
 ; GFX11-SDAG-NEXT:    v_permlane16_b32 v2, v0, s0, s1
 ; GFX11-SDAG-NEXT:    global_store_b64 v4, v[2:3], s[2:3]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_f64:
@@ -6366,8 +5844,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_f64(ptr addrspace(1) %out, i32
 ; GFX11-GISEL-NEXT:    v_permlane16_b32 v2, v0, s0, s1
 ; GFX11-GISEL-NEXT:    v_permlane16_b32 v3, v1, s0, s1
 ; GFX11-GISEL-NEXT:    global_store_b64 v4, v[2:3], s[2:3]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_f64:
@@ -6384,8 +5860,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_f64(ptr addrspace(1) %out, i32
 ; GFX12-SDAG-NEXT:    v_permlane16_b32 v3, v1, s0, s1
 ; GFX12-SDAG-NEXT:    v_permlane16_b32 v2, v0, s0, s1
 ; GFX12-SDAG-NEXT:    global_store_b64 v4, v[2:3], s[2:3]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_f64:
@@ -6402,8 +5876,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_f64(ptr addrspace(1) %out, i32
 ; GFX12-GISEL-NEXT:    v_permlane16_b32 v2, v0, s0, s1
 ; GFX12-GISEL-NEXT:    v_permlane16_b32 v3, v1, s0, s1
 ; GFX12-GISEL-NEXT:    global_store_b64 v4, v[2:3], s[2:3]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %tidx = call i32 @llvm.amdgcn.workitem.id.x()
   %tidx_f32 = bitcast i32 %tidx to float
@@ -6435,8 +5907,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_i32(ptr addrspace(1) %out,
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0]
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_permlane16_b32_i_tid_fi_i32:
@@ -6449,8 +5919,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_i32(ptr addrspace(1) %out,
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0]
 ; GFX12-NEXT:    global_store_b32 v1, v0, s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tidx = call i32 @llvm.amdgcn.workitem.id.x()
   %undef = freeze i32 poison
@@ -6481,8 +5949,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_f32(ptr addrspace(1) %out,
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0]
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_permlane16_b32_i_tid_fi_f32:
@@ -6495,8 +5961,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_f32(ptr addrspace(1) %out,
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0]
 ; GFX12-NEXT:    global_store_b32 v1, v0, s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tidx = call i32 @llvm.amdgcn.workitem.id.x()
   %tidx_f32 = bitcast i32 %tidx to float
@@ -6543,8 +6007,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_i64(ptr addrspace(1) %out,
 ; GFX11-SDAG-NEXT:    v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,0]
 ; GFX11-SDAG-NEXT:    v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0]
 ; GFX11-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_fi_i64:
@@ -6559,8 +6021,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_i64(ptr addrspace(1) %out,
 ; GFX11-GISEL-NEXT:    v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0]
 ; GFX11-GISEL-NEXT:    v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,0]
 ; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_fi_i64:
@@ -6575,8 +6035,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_i64(ptr addrspace(1) %out,
 ; GFX12-SDAG-NEXT:    v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,0]
 ; GFX12-SDAG-NEXT:    v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0]
 ; GFX12-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_fi_i64:
@@ -6591,8 +6049,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_i64(ptr addrspace(1) %out,
 ; GFX12-GISEL-NEXT:    v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0]
 ; GFX12-GISEL-NEXT:    v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,0]
 ; GFX12-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %tidx = call i32 @llvm.amdgcn.workitem.id.x()
   %tidx_i64 = zext i32 %tidx to i64
@@ -6642,8 +6098,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_f64(ptr addrspace(1) %out,
 ; GFX11-SDAG-NEXT:    v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0]
 ; GFX11-SDAG-NEXT:    v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0]
 ; GFX11-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_fi_f64:
@@ -6659,8 +6113,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_f64(ptr addrspace(1) %out,
 ; GFX11-GISEL-NEXT:    v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0]
 ; GFX11-GISEL-NEXT:    v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0]
 ; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_fi_f64:
@@ -6676,8 +6128,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_f64(ptr addrspace(1) %out,
 ; GFX12-SDAG-NEXT:    v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0]
 ; GFX12-SDAG-NEXT:    v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0]
 ; GFX12-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_fi_f64:
@@ -6693,8 +6143,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_f64(ptr addrspace(1) %out,
 ; GFX12-GISEL-NEXT:    v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0]
 ; GFX12-GISEL-NEXT:    v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0]
 ; GFX12-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %tidx = call i32 @llvm.amdgcn.workitem.id.x()
   %tidx_f32 = bitcast i32 %tidx to float
@@ -6727,8 +6175,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_i32(ptr addrspace(1) %out,
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1]
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_permlane16_b32_i_tid_bc_i32:
@@ -6741,8 +6187,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_i32(ptr addrspace(1) %out,
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1]
 ; GFX12-NEXT:    global_store_b32 v1, v0, s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tidx = call i32 @llvm.amdgcn.workitem.id.x()
   %undef = freeze i32 poison
@@ -6773,8 +6217,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_f32(ptr addrspace(1) %out,
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1]
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_permlane16_b32_i_tid_bc_f32:
@@ -6787,8 +6229,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_f32(ptr addrspace(1) %out,
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1]
 ; GFX12-NEXT:    global_store_b32 v1, v0, s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tidx = call i32 @llvm.amdgcn.workitem.id.x()
   %tidx_f32 = bitcast i32 %tidx to float
@@ -6835,8 +6275,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_i64(ptr addrspace(1) %out,
 ; GFX11-SDAG-NEXT:    v_permlane16_b32 v1, v2, s0, s1 op_sel:[0,1]
 ; GFX11-SDAG-NEXT:    v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1]
 ; GFX11-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_bc_i64:
@@ -6851,8 +6289,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_i64(ptr addrspace(1) %out,
 ; GFX11-GISEL-NEXT:    v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1]
 ; GFX11-GISEL-NEXT:    v_permlane16_b32 v1, v2, s0, s1 op_sel:[0,1]
 ; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_bc_i64:
@@ -6867,8 +6303,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_i64(ptr addrspace(1) %out,
 ; GFX12-SDAG-NEXT:    v_permlane16_b32 v1, v2, s0, s1 op_sel:[0,1]
 ; GFX12-SDAG-NEXT:    v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1]
 ; GFX12-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_bc_i64:
@@ -6883,8 +6317,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_i64(ptr addrspace(1) %out,
 ; GFX12-GISEL-NEXT:    v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1]
 ; GFX12-GISEL-NEXT:    v_permlane16_b32 v1, v2, s0, s1 op_sel:[0,1]
 ; GFX12-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %tidx = call i32 @llvm.amdgcn.workitem.id.x()
   %tidx_i64 = zext i32 %tidx to i64
@@ -6934,8 +6366,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_f64(ptr addrspace(1) %out,
 ; GFX11-SDAG-NEXT:    v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1]
 ; GFX11-SDAG-NEXT:    v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1]
 ; GFX11-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_bc_f64:
@@ -6951,8 +6381,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_f64(ptr addrspace(1) %out,
 ; GFX11-GISEL-NEXT:    v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1]
 ; GFX11-GISEL-NEXT:    v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1]
 ; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_bc_f64:
@@ -6968,8 +6396,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_f64(ptr addrspace(1) %out,
 ; GFX12-SDAG-NEXT:    v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1]
 ; GFX12-SDAG-NEXT:    v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1]
 ; GFX12-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_bc_f64:
@@ -6985,8 +6411,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_f64(ptr addrspace(1) %out,
 ; GFX12-GISEL-NEXT:    v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1]
 ; GFX12-GISEL-NEXT:    v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1]
 ; GFX12-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %tidx = call i32 @llvm.amdgcn.workitem.id.x()
   %tidx_f32 = bitcast i32 %tidx to float
@@ -7019,8 +6443,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_i32(ptr addrspace(1) %ou
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1]
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_permlane16_b32_i_tid_fi_bc_i32:
@@ -7033,8 +6455,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_i32(ptr addrspace(1) %ou
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1]
 ; GFX12-NEXT:    global_store_b32 v1, v0, s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tidx = call i32 @llvm.amdgcn.workitem.id.x()
   %undef = freeze i32 poison
@@ -7065,8 +6485,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_f32(ptr addrspace(1) %ou
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1]
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_permlane16_b32_i_tid_fi_bc_f32:
@@ -7079,8 +6497,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_f32(ptr addrspace(1) %ou
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1]
 ; GFX12-NEXT:    global_store_b32 v1, v0, s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tidx = call i32 @llvm.amdgcn.workitem.id.x()
   %tidx_f32 = bitcast i32 %tidx to float
@@ -7127,8 +6543,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_i64(ptr addrspace(1) %ou
 ; GFX11-SDAG-NEXT:    v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,1]
 ; GFX11-SDAG-NEXT:    v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1]
 ; GFX11-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_fi_bc_i64:
@@ -7143,8 +6557,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_i64(ptr addrspace(1) %ou
 ; GFX11-GISEL-NEXT:    v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1]
 ; GFX11-GISEL-NEXT:    v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,1]
 ; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_fi_bc_i64:
@@ -7159,8 +6571,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_i64(ptr addrspace(1) %ou
 ; GFX12-SDAG-NEXT:    v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,1]
 ; GFX12-SDAG-NEXT:    v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1]
 ; GFX12-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_fi_bc_i64:
@@ -7175,8 +6585,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_i64(ptr addrspace(1) %ou
 ; GFX12-GISEL-NEXT:    v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1]
 ; GFX12-GISEL-NEXT:    v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,1]
 ; GFX12-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %tidx = call i32 @llvm.amdgcn.workitem.id.x()
   %tidx_i64 = zext i32 %tidx to i64
@@ -7226,8 +6634,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_f64(ptr addrspace(1) %ou
 ; GFX11-SDAG-NEXT:    v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1]
 ; GFX11-SDAG-NEXT:    v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1]
 ; GFX11-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_fi_bc_f64:
@@ -7243,8 +6649,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_f64(ptr addrspace(1) %ou
 ; GFX11-GISEL-NEXT:    v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1]
 ; GFX11-GISEL-NEXT:    v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1]
 ; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_fi_bc_f64:
@@ -7260,8 +6664,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_f64(ptr addrspace(1) %ou
 ; GFX12-SDAG-NEXT:    v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1]
 ; GFX12-SDAG-NEXT:    v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1]
 ; GFX12-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_fi_bc_f64:
@@ -7277,8 +6679,6 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_f64(ptr addrspace(1) %ou
 ; GFX12-GISEL-NEXT:    v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1]
 ; GFX12-GISEL-NEXT:    v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1]
 ; GFX12-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %tidx = call i32 @llvm.amdgcn.workitem.id.x()
   %tidx_f32 = bitcast i32 %tidx to float
@@ -7311,8 +6711,6 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid_i32(ptr addrspace(1) %out,
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_permlanex16_b32 v0, v0, s0, s1
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_permlanex16_b32_tid_tid_i32:
@@ -7325,8 +6723,6 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid_i32(ptr addrspace(1) %out,
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_permlanex16_b32 v0, v0, s0, s1
 ; GFX12-NEXT:    global_store_b32 v1, v0, s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tidx = call i32 @llvm.amdgcn.workitem.id.x()
   %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %tidx, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 false)
@@ -7356,8 +6752,6 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid_f32(ptr addrspace(1) %out,
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_permlanex16_b32 v0, v0, s0, s1
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_permlanex16_b32_tid_tid_f32:
@@ -7370,8 +6764,6 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid_f32(ptr addrspace(1) %out,
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_permlanex16_b32 v0, v0, s0, s1
 ; GFX12-NEXT:    global_store_b32 v1, v0, s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tidx = call i32 @llvm.amdgcn.workitem.id.x()
   %tidx_f32 = bitcast i32 %tidx to float
@@ -7419,8 +6811,6 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid_i64(ptr addrspace(1) %out,
 ; GFX11-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s0, s1
 ; GFX11-SDAG-NEXT:    v_permlanex16_b32 v0, v0, s0, s1
 ; GFX11-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_permlanex16_b32_tid_tid_i64:
@@ -7435,8 +6825,6 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid_i64(ptr addrspace(1) %out,
 ; GFX11-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s0, s1
 ; GFX11-GISEL-NEXT:    v_permlanex16_b32 v1, v1, s0, s1
 ; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: v_permlanex16_b32_tid_tid_i64:
@@ -7451,8 +6839,6 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid_i64(ptr addrspace(1) %out,
 ; GFX12-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s0, s1
 ; GFX12-SDAG-NEXT:    v_permlanex16_b32 v0, v0, s0, s1
 ; GFX12-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlanex16_b32_tid_tid_i64:
@@ -7467,8 +6853,6 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid_i64(ptr addrspace(1) %out,
 ; GFX12-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s0, s1
 ; GFX12-GISEL-NEXT:    v_permlanex16_b32 v1, v1, s0, s1
 ; GFX12-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %tidx = call i32 @llvm.amdgcn.workitem.id.x()
   %tidx_i64 = zext i32 %tidx to i64
@@ -7517,8 +6901,6 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid_f64(ptr addrspace(1) %out,
 ; GFX11-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s0, s1
 ; GFX11-SDAG-NEXT:    v_permlanex16_b32 v0, v0, s0, s1
 ; GFX11-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_permlanex16_b32_tid_tid_f64:
@@ -7534,8 +6916,6 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid_f64(ptr addrspace(1) %out,
 ; GFX11-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s0, s1
 ; GFX11-GISEL-NEXT:    v_permlanex16_b32 v1, v1, s0, s1
 ; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: v_permlanex16_b32_tid_tid_f64:
@@ -7551,8 +6931,6 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid_f64(ptr addrspace(1) %out,
 ; GFX12-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s0, s1
 ; GFX12-SDAG-NEXT:    v_permlanex16_b32 v0, v0, s0, s1
 ; GFX12-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlanex16_b32_tid_tid_f64:
@@ -7568,8 +6946,6 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid_f64(ptr addrspace(1) %out,
 ; GFX12-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s0, s1
 ; GFX12-GISEL-NEXT:    v_permlanex16_b32 v1, v1, s0, s1
 ; GFX12-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %tidx = call i32 @llvm.amdgcn.workitem.id.x()
   %tidx_f32 = bitcast i32 %tidx to float
@@ -7601,8 +6977,6 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid_i32(ptr addrspace(1) %out
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_permlanex16_b32 v0, v0, s0, s1
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_permlanex16_b32_undef_tid_i32:
@@ -7615,8 +6989,6 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid_i32(ptr addrspace(1) %out
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_permlanex16_b32 v0, v0, s0, s1
 ; GFX12-NEXT:    global_store_b32 v1, v0, s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tidx = call i32 @llvm.amdgcn.workitem.id.x()
   %undef = freeze i32 poison
@@ -7647,8 +7019,6 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid_f32(ptr addrspace(1) %out
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_permlanex16_b32 v0, v0, s0, s1
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_permlanex16_b32_undef_tid_f32:
@@ -7661,8 +7031,6 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid_f32(ptr addrspace(1) %out
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_permlanex16_b32 v0, v0, s0, s1
 ; GFX12-NEXT:    global_store_b32 v1, v0, s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tidx = call i32 @llvm.amdgcn.workitem.id.x()
   %tidx_f32 = bitcast i32 %tidx to float
@@ -7709,8 +7077,6 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid_i64(ptr addrspace(1) %out
 ; GFX11-SDAG-NEXT:    v_permlanex16_b32 v1, v2, s0, s1
 ; GFX11-SDAG-NEXT:    v_permlanex16_b32 v0, v0, s0, s1
 ; GFX11-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_permlanex16_b32_undef_tid_i64:
@@ -7725,8 +7091,6 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid_i64(ptr addrspace(1) %out
 ; GFX11-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s0, s1
 ; GFX11-GISEL-NEXT:    v_permlanex16_b32 v1, v2, s0, s1
 ; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: v_permlanex16_b32_undef_tid_i64:
@@ -7741,8 +7105,6 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid_i64(ptr addrspace(1) %out
 ; GFX12-SDAG-NEXT:    v_permlanex16_b32 v1, v2, s0, s1
 ; GFX12-SDAG-NEXT:    v_permlanex16_b32 v0, v0, s0, s1
 ; GFX12-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlanex16_b32_undef_tid_i64:
@@ -7757,8 +7119,6 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid_i64(ptr addrspace(1) %out
 ; GFX12-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s0, s1
 ; GFX12-GISEL-NEXT:    v_permlanex16_b32 v1, v2, s0, s1
 ; GFX12-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %tidx = call i32 @llvm.amdgcn.workitem.id.x()
   %tidx_i64 = zext i32 %tidx to i64
@@ -7808,8 +7168,6 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid_f64(ptr addrspace(1) %out
 ; GFX11-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s0, s1
 ; GFX11-SDAG-NEXT:    v_permlanex16_b32 v0, v0, s0, s1
 ; GFX11-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_permlanex16_b32_undef_tid_f64:
@@ -7825,8 +7183,6 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid_f64(ptr addrspace(1) %out
 ; GFX11-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s0, s1
 ; GFX11-GISEL-NEXT:    v_permlanex16_b32 v1, v1, s0, s1
 ; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: v_permlanex16_b32_undef_tid_f64:
@@ -7842,8 +7198,6 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid_f64(ptr addrspace(1) %out
 ; GFX12-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s0, s1
 ; GFX12-SDAG-NEXT:    v_permlanex16_b32 v0, v0, s0, s1
 ; GFX12-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlanex16_b32_undef_tid_f64:
@@ -7859,8 +7213,6 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid_f64(ptr addrspace(1) %out
 ; GFX12-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s0, s1
 ; GFX12-GISEL-NEXT:    v_permlanex16_b32 v1, v1, s0, s1
 ; GFX12-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %tidx = call i32 @llvm.amdgcn.workitem.id.x()
   %tidx_f32 = bitcast i32 %tidx to float
@@ -7907,8 +7259,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_i32(ptr addrspace(1) %out, i3
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SDAG-NEXT:    v_permlanex16_b32 v1, v0, s0, s1
 ; GFX11-SDAG-NEXT:    global_store_b32 v2, v1, s[2:3]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_i32:
@@ -7923,8 +7273,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_i32(ptr addrspace(1) %out, i3
 ; GFX11-GISEL-NEXT:    v_permlanex16_b32 v1, v0, s0, s1
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-GISEL-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_i32:
@@ -7938,8 +7286,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_i32(ptr addrspace(1) %out, i3
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlanex16_b32 v1, v0, s0, s1
 ; GFX12-SDAG-NEXT:    global_store_b32 v2, v1, s[2:3]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_i32:
@@ -7954,8 +7300,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_i32(ptr addrspace(1) %out, i3
 ; GFX12-GISEL-NEXT:    v_permlanex16_b32 v1, v0, s0, s1
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-GISEL-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %tidx = call i32 @llvm.amdgcn.workitem.id.x()
   %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 12345, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 false)
@@ -7999,8 +7343,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_f32(ptr addrspace(1) %out, i3
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SDAG-NEXT:    v_permlanex16_b32 v1, v0, s0, s1
 ; GFX11-SDAG-NEXT:    global_store_b32 v2, v1, s[2:3]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_f32:
@@ -8015,8 +7357,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_f32(ptr addrspace(1) %out, i3
 ; GFX11-GISEL-NEXT:    v_permlanex16_b32 v1, v0, s0, s1
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-GISEL-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_f32:
@@ -8030,8 +7370,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_f32(ptr addrspace(1) %out, i3
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlanex16_b32 v1, v0, s0, s1
 ; GFX12-SDAG-NEXT:    global_store_b32 v2, v1, s[2:3]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_f32:
@@ -8046,8 +7384,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_f32(ptr addrspace(1) %out, i3
 ; GFX12-GISEL-NEXT:    v_permlanex16_b32 v1, v0, s0, s1
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-GISEL-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %tidx = call i32 @llvm.amdgcn.workitem.id.x()
   %tidx_f32 = bitcast i32 %tidx to float
@@ -8097,8 +7433,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_i64(ptr addrspace(1) %out, i3
 ; GFX11-SDAG-NEXT:    v_permlanex16_b32 v2, v2, s0, s1
 ; GFX11-SDAG-NEXT:    v_permlanex16_b32 v1, v0, s0, s1
 ; GFX11-SDAG-NEXT:    global_store_b64 v3, v[1:2], s[2:3]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_i64:
@@ -8113,8 +7447,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_i64(ptr addrspace(1) %out, i3
 ; GFX11-GISEL-NEXT:    v_permlanex16_b32 v1, v0, s0, s1
 ; GFX11-GISEL-NEXT:    v_permlanex16_b32 v2, v2, s0, s1
 ; GFX11-GISEL-NEXT:    global_store_b64 v3, v[1:2], s[2:3]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_i64:
@@ -8129,8 +7461,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_i64(ptr addrspace(1) %out, i3
 ; GFX12-SDAG-NEXT:    v_permlanex16_b32 v2, v2, s0, s1
 ; GFX12-SDAG-NEXT:    v_permlanex16_b32 v1, v0, s0, s1
 ; GFX12-SDAG-NEXT:    global_store_b64 v3, v[1:2], s[2:3]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_i64:
@@ -8145,8 +7475,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_i64(ptr addrspace(1) %out, i3
 ; GFX12-GISEL-NEXT:    v_permlanex16_b32 v1, v0, s0, s1
 ; GFX12-GISEL-NEXT:    v_permlanex16_b32 v2, v2, s0, s1
 ; GFX12-GISEL-NEXT:    global_store_b64 v3, v[1:2], s[2:3]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %tidx = call i32 @llvm.amdgcn.workitem.id.x()
   %tidx_i64 = zext i32 %tidx to i64
@@ -8200,8 +7528,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_f64(ptr addrspace(1) %out, i3
 ; GFX11-SDAG-NEXT:    v_permlanex16_b32 v3, v1, s0, s1
 ; GFX11-SDAG-NEXT:    v_permlanex16_b32 v2, v0, s0, s1
 ; GFX11-SDAG-NEXT:    global_store_b64 v4, v[2:3], s[2:3]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_f64:
@@ -8218,8 +7544,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_f64(ptr addrspace(1) %out, i3
 ; GFX11-GISEL-NEXT:    v_permlanex16_b32 v2, v0, s0, s1
 ; GFX11-GISEL-NEXT:    v_permlanex16_b32 v3, v1, s0, s1
 ; GFX11-GISEL-NEXT:    global_store_b64 v4, v[2:3], s[2:3]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_f64:
@@ -8236,8 +7560,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_f64(ptr addrspace(1) %out, i3
 ; GFX12-SDAG-NEXT:    v_permlanex16_b32 v3, v1, s0, s1
 ; GFX12-SDAG-NEXT:    v_permlanex16_b32 v2, v0, s0, s1
 ; GFX12-SDAG-NEXT:    global_store_b64 v4, v[2:3], s[2:3]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_f64:
@@ -8254,8 +7576,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_f64(ptr addrspace(1) %out, i3
 ; GFX12-GISEL-NEXT:    v_permlanex16_b32 v2, v0, s0, s1
 ; GFX12-GISEL-NEXT:    v_permlanex16_b32 v3, v1, s0, s1
 ; GFX12-GISEL-NEXT:    global_store_b64 v4, v[2:3], s[2:3]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %tidx = call i32 @llvm.amdgcn.workitem.id.x()
   %tidx_f32 = bitcast i32 %tidx to float
@@ -8287,8 +7607,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_i32(ptr addrspace(1) %out,
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0]
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_permlanex16_b32_i_tid_fi_i32:
@@ -8301,8 +7619,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_i32(ptr addrspace(1) %out,
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0]
 ; GFX12-NEXT:    global_store_b32 v1, v0, s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tidx = call i32 @llvm.amdgcn.workitem.id.x()
   %undef = freeze i32 poison
@@ -8333,8 +7649,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_f32(ptr addrspace(1) %out,
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0]
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_permlanex16_b32_i_tid_fi_f32:
@@ -8347,8 +7661,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_f32(ptr addrspace(1) %out,
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0]
 ; GFX12-NEXT:    global_store_b32 v1, v0, s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tidx = call i32 @llvm.amdgcn.workitem.id.x()
   %tidx_f32 = bitcast i32 %tidx to float
@@ -8395,8 +7707,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_i64(ptr addrspace(1) %out,
 ; GFX11-SDAG-NEXT:    v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,0]
 ; GFX11-SDAG-NEXT:    v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0]
 ; GFX11-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_i64:
@@ -8411,8 +7721,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_i64(ptr addrspace(1) %out,
 ; GFX11-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0]
 ; GFX11-GISEL-NEXT:    v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,0]
 ; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_i64:
@@ -8427,8 +7735,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_i64(ptr addrspace(1) %out,
 ; GFX12-SDAG-NEXT:    v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,0]
 ; GFX12-SDAG-NEXT:    v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0]
 ; GFX12-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_i64:
@@ -8443,8 +7749,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_i64(ptr addrspace(1) %out,
 ; GFX12-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0]
 ; GFX12-GISEL-NEXT:    v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,0]
 ; GFX12-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %tidx = call i32 @llvm.amdgcn.workitem.id.x()
   %tidx_i64 = zext i32 %tidx to i64
@@ -8494,8 +7798,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_f64(ptr addrspace(1) %out,
 ; GFX11-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0]
 ; GFX11-SDAG-NEXT:    v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0]
 ; GFX11-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_f64:
@@ -8511,8 +7813,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_f64(ptr addrspace(1) %out,
 ; GFX11-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0]
 ; GFX11-GISEL-NEXT:    v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0]
 ; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_f64:
@@ -8528,8 +7828,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_f64(ptr addrspace(1) %out,
 ; GFX12-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0]
 ; GFX12-SDAG-NEXT:    v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0]
 ; GFX12-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_f64:
@@ -8545,8 +7843,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_f64(ptr addrspace(1) %out,
 ; GFX12-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0]
 ; GFX12-GISEL-NEXT:    v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0]
 ; GFX12-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %tidx = call i32 @llvm.amdgcn.workitem.id.x()
   %tidx_f32 = bitcast i32 %tidx to float
@@ -8579,8 +7875,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_i32(ptr addrspace(1) %out,
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1]
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_permlanex16_b32_i_tid_bc_i32:
@@ -8593,8 +7887,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_i32(ptr addrspace(1) %out,
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1]
 ; GFX12-NEXT:    global_store_b32 v1, v0, s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tidx = call i32 @llvm.amdgcn.workitem.id.x()
   %undef = freeze i32 poison
@@ -8625,8 +7917,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_f32(ptr addrspace(1) %out,
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1]
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_permlanex16_b32_i_tid_bc_f32:
@@ -8639,8 +7929,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_f32(ptr addrspace(1) %out,
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1]
 ; GFX12-NEXT:    global_store_b32 v1, v0, s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tidx = call i32 @llvm.amdgcn.workitem.id.x()
   %tidx_f32 = bitcast i32 %tidx to float
@@ -8687,8 +7975,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_i64(ptr addrspace(1) %out,
 ; GFX11-SDAG-NEXT:    v_permlanex16_b32 v1, v2, s0, s1 op_sel:[0,1]
 ; GFX11-SDAG-NEXT:    v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1]
 ; GFX11-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_bc_i64:
@@ -8703,8 +7989,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_i64(ptr addrspace(1) %out,
 ; GFX11-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1]
 ; GFX11-GISEL-NEXT:    v_permlanex16_b32 v1, v2, s0, s1 op_sel:[0,1]
 ; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_bc_i64:
@@ -8719,8 +8003,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_i64(ptr addrspace(1) %out,
 ; GFX12-SDAG-NEXT:    v_permlanex16_b32 v1, v2, s0, s1 op_sel:[0,1]
 ; GFX12-SDAG-NEXT:    v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1]
 ; GFX12-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_bc_i64:
@@ -8735,8 +8017,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_i64(ptr addrspace(1) %out,
 ; GFX12-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1]
 ; GFX12-GISEL-NEXT:    v_permlanex16_b32 v1, v2, s0, s1 op_sel:[0,1]
 ; GFX12-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %tidx = call i32 @llvm.amdgcn.workitem.id.x()
   %tidx_i64 = zext i32 %tidx to i64
@@ -8786,8 +8066,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_f64(ptr addrspace(1) %out,
 ; GFX11-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1]
 ; GFX11-SDAG-NEXT:    v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1]
 ; GFX11-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_bc_f64:
@@ -8803,8 +8081,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_f64(ptr addrspace(1) %out,
 ; GFX11-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1]
 ; GFX11-GISEL-NEXT:    v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1]
 ; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_bc_f64:
@@ -8820,8 +8096,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_f64(ptr addrspace(1) %out,
 ; GFX12-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1]
 ; GFX12-SDAG-NEXT:    v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1]
 ; GFX12-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_bc_f64:
@@ -8837,8 +8111,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_f64(ptr addrspace(1) %out,
 ; GFX12-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1]
 ; GFX12-GISEL-NEXT:    v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1]
 ; GFX12-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %tidx = call i32 @llvm.amdgcn.workitem.id.x()
   %tidx_f32 = bitcast i32 %tidx to float
@@ -8871,8 +8143,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_i32(ptr addrspace(1) %o
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1]
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_permlanex16_b32_i_tid_fi_bc_i32:
@@ -8885,8 +8155,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_i32(ptr addrspace(1) %o
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1]
 ; GFX12-NEXT:    global_store_b32 v1, v0, s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tidx = call i32 @llvm.amdgcn.workitem.id.x()
   %undef = freeze i32 poison
@@ -8917,8 +8185,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_f32(ptr addrspace(1) %o
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1]
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_permlanex16_b32_i_tid_fi_bc_f32:
@@ -8931,8 +8197,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_f32(ptr addrspace(1) %o
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1]
 ; GFX12-NEXT:    global_store_b32 v1, v0, s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tidx = call i32 @llvm.amdgcn.workitem.id.x()
   %tidx_f32 = bitcast i32 %tidx to float
@@ -8979,8 +8243,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_i64(ptr addrspace(1) %o
 ; GFX11-SDAG-NEXT:    v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,1]
 ; GFX11-SDAG-NEXT:    v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1]
 ; GFX11-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_bc_i64:
@@ -8995,8 +8257,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_i64(ptr addrspace(1) %o
 ; GFX11-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1]
 ; GFX11-GISEL-NEXT:    v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,1]
 ; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_bc_i64:
@@ -9011,8 +8271,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_i64(ptr addrspace(1) %o
 ; GFX12-SDAG-NEXT:    v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,1]
 ; GFX12-SDAG-NEXT:    v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1]
 ; GFX12-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_bc_i64:
@@ -9027,8 +8285,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_i64(ptr addrspace(1) %o
 ; GFX12-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1]
 ; GFX12-GISEL-NEXT:    v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,1]
 ; GFX12-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %tidx = call i32 @llvm.amdgcn.workitem.id.x()
   %tidx_i64 = zext i32 %tidx to i64
@@ -9078,8 +8334,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_f64(ptr addrspace(1) %o
 ; GFX11-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1]
 ; GFX11-SDAG-NEXT:    v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1]
 ; GFX11-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_bc_f64:
@@ -9095,8 +8349,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_f64(ptr addrspace(1) %o
 ; GFX11-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1]
 ; GFX11-GISEL-NEXT:    v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1]
 ; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_bc_f64:
@@ -9112,8 +8364,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_f64(ptr addrspace(1) %o
 ; GFX12-SDAG-NEXT:    v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1]
 ; GFX12-SDAG-NEXT:    v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1]
 ; GFX12-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_bc_f64:
@@ -9129,8 +8379,6 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_f64(ptr addrspace(1) %o
 ; GFX12-GISEL-NEXT:    v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1]
 ; GFX12-GISEL-NEXT:    v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1]
 ; GFX12-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %tidx = call i32 @llvm.amdgcn.workitem.id.x()
   %tidx_f32 = bitcast i32 %tidx to float
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll
index a65143255bbb4e..59be9f8641c1ba 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll
@@ -17,8 +17,6 @@ define amdgpu_kernel void @v_permlane16var_b32_vv(ptr addrspace(1) %out, i32 %sr
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlane16_var_b32 v0, v0, v1
 ; GFX12-SDAG-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlane16var_b32_vv:
@@ -30,8 +28,6 @@ define amdgpu_kernel void @v_permlane16var_b32_vv(ptr addrspace(1) %out, i32 %sr
 ; GFX12-GISEL-NEXT:    v_permlane16_var_b32 v0, v0, v1
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src0, i32 %src1, i1 false, i1 false)
   store i32 %v, ptr addrspace(1) %out
@@ -48,8 +44,6 @@ define amdgpu_kernel void @v_permlane16var_b32_vi(ptr addrspace(1) %out, i32 %sr
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlane16_var_b32 v1, v1, v0
 ; GFX12-SDAG-NEXT:    global_store_b32 v2, v1, s[0:1]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlane16var_b32_vi:
@@ -61,8 +55,6 @@ define amdgpu_kernel void @v_permlane16var_b32_vi(ptr addrspace(1) %out, i32 %sr
 ; GFX12-GISEL-NEXT:    v_permlane16_var_b32 v1, v1, v0
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-GISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src0, i32 1, i1 false, i1 false)
   store i32 %v, ptr addrspace(1) %out
@@ -79,8 +71,6 @@ define amdgpu_kernel void @v_permlane16var_b32_vl(ptr addrspace(1) %out, i32 %sr
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlane16_var_b32 v1, v1, v0
 ; GFX12-SDAG-NEXT:    global_store_b32 v2, v1, s[0:1]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlane16var_b32_vl:
@@ -92,8 +82,6 @@ define amdgpu_kernel void @v_permlane16var_b32_vl(ptr addrspace(1) %out, i32 %sr
 ; GFX12-GISEL-NEXT:    v_permlane16_var_b32 v1, v1, v0
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-GISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src0, i32 49617, i1 false, i1 false)
   store i32 %v, ptr addrspace(1) %out
@@ -110,8 +98,6 @@ define amdgpu_kernel void @v_permlane16var_b32_vvv(ptr addrspace(1) %out, i32 %s
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlane16_var_b32 v1, v1, v0
 ; GFX12-SDAG-NEXT:    global_store_b32 v2, v1, s[0:1]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlane16var_b32_vvv:
@@ -123,8 +109,6 @@ define amdgpu_kernel void @v_permlane16var_b32_vvv(ptr addrspace(1) %out, i32 %s
 ; GFX12-GISEL-NEXT:    v_permlane16_var_b32 v1, v1, v0
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-GISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %tidx = call i32 @llvm.amdgcn.workitem.id.x()
   %v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src0, i32 %tidx, i1 false, i1 false)
@@ -142,8 +126,6 @@ define amdgpu_kernel void @v_permlane16var_b32_vv_fi(ptr addrspace(1) %out, i32
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlane16_var_b32 v0, v0, v1 op_sel:[1,0]
 ; GFX12-SDAG-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlane16var_b32_vv_fi:
@@ -155,8 +137,6 @@ define amdgpu_kernel void @v_permlane16var_b32_vv_fi(ptr addrspace(1) %out, i32
 ; GFX12-GISEL-NEXT:    v_permlane16_var_b32 v0, v0, v1 op_sel:[1,0]
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src0, i32 %src1, i1 true, i1 false)
   store i32 %v, ptr addrspace(1) %out
@@ -173,8 +153,6 @@ define amdgpu_kernel void @v_permlane16var_b32_vv_bc(ptr addrspace(1) %out, i32
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlane16_var_b32 v0, v0, v1 op_sel:[0,1]
 ; GFX12-SDAG-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlane16var_b32_vv_bc:
@@ -186,8 +164,6 @@ define amdgpu_kernel void @v_permlane16var_b32_vv_bc(ptr addrspace(1) %out, i32
 ; GFX12-GISEL-NEXT:    v_permlane16_var_b32 v0, v0, v1 op_sel:[0,1]
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src0, i32 %src1, i1 false, i1 true)
   store i32 %v, ptr addrspace(1) %out
@@ -204,8 +180,6 @@ define amdgpu_kernel void @v_permlane16var_b32_vv_fi_bc(ptr addrspace(1) %out, i
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlane16_var_b32 v0, v0, v1 op_sel:[1,1]
 ; GFX12-SDAG-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlane16var_b32_vv_fi_bc:
@@ -217,8 +191,6 @@ define amdgpu_kernel void @v_permlane16var_b32_vv_fi_bc(ptr addrspace(1) %out, i
 ; GFX12-GISEL-NEXT:    v_permlane16_var_b32 v0, v0, v1 op_sel:[1,1]
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src0, i32 %src1, i1 true, i1 true)
   store i32 %v, ptr addrspace(1) %out
@@ -235,8 +207,6 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv(ptr addrspace(1) %out, i32 %s
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlanex16_var_b32 v0, v0, v1
 ; GFX12-SDAG-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vv:
@@ -248,8 +218,6 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv(ptr addrspace(1) %out, i32 %s
 ; GFX12-GISEL-NEXT:    v_permlanex16_var_b32 v0, v0, v1
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %v = call i32 @llvm.amdgcn.permlanex16.var(i32 %src0, i32 %src0, i32 %src1, i1 false, i1 false)
   store i32 %v, ptr addrspace(1) %out
@@ -266,8 +234,6 @@ define amdgpu_kernel void @v_permlanex16var_b32_vi(ptr addrspace(1) %out, i32 %s
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlanex16_var_b32 v1, v1, v0
 ; GFX12-SDAG-NEXT:    global_store_b32 v2, v1, s[0:1]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vi:
@@ -279,8 +245,6 @@ define amdgpu_kernel void @v_permlanex16var_b32_vi(ptr addrspace(1) %out, i32 %s
 ; GFX12-GISEL-NEXT:    v_permlanex16_var_b32 v1, v1, v0
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-GISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %v = call i32 @llvm.amdgcn.permlanex16.var(i32 %src0, i32 %src0, i32 1, i1 false, i1 false)
   store i32 %v, ptr addrspace(1) %out
@@ -297,8 +261,6 @@ define amdgpu_kernel void @v_permlanex16var_b32_vl(ptr addrspace(1) %out, i32 %s
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlanex16_var_b32 v1, v1, v0
 ; GFX12-SDAG-NEXT:    global_store_b32 v2, v1, s[0:1]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vl:
@@ -310,8 +272,6 @@ define amdgpu_kernel void @v_permlanex16var_b32_vl(ptr addrspace(1) %out, i32 %s
 ; GFX12-GISEL-NEXT:    v_permlanex16_var_b32 v1, v1, v0
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-GISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %v = call i32 @llvm.amdgcn.permlanex16.var(i32 %src0, i32 %src0, i32 49617, i1 false, i1 false)
   store i32 %v, ptr addrspace(1) %out
@@ -328,8 +288,6 @@ define amdgpu_kernel void @v_permlanex16var_b32_vvv(ptr addrspace(1) %out, i32 %
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlanex16_var_b32 v1, v1, v0
 ; GFX12-SDAG-NEXT:    global_store_b32 v2, v1, s[0:1]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vvv:
@@ -341,8 +299,6 @@ define amdgpu_kernel void @v_permlanex16var_b32_vvv(ptr addrspace(1) %out, i32 %
 ; GFX12-GISEL-NEXT:    v_permlanex16_var_b32 v1, v1, v0
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-GISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %tidx = call i32 @llvm.amdgcn.workitem.id.x()
   %v = call i32 @llvm.amdgcn.permlanex16.var(i32 %src0, i32 %src0, i32 %tidx, i1 false, i1 false)
@@ -360,8 +316,6 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv_fi(ptr addrspace(1) %out, i32
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,0]
 ; GFX12-SDAG-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vv_fi:
@@ -373,8 +327,6 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv_fi(ptr addrspace(1) %out, i32
 ; GFX12-GISEL-NEXT:    v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,0]
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %v = call i32 @llvm.amdgcn.permlanex16.var(i32 %src0, i32 %src0, i32 %src1, i1 true, i1 false)
   store i32 %v, ptr addrspace(1) %out
@@ -391,8 +343,6 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv_bc(ptr addrspace(1) %out, i32
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlanex16_var_b32 v0, v0, v1 op_sel:[0,1]
 ; GFX12-SDAG-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vv_bc:
@@ -404,8 +354,6 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv_bc(ptr addrspace(1) %out, i32
 ; GFX12-GISEL-NEXT:    v_permlanex16_var_b32 v0, v0, v1 op_sel:[0,1]
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %v = call i32 @llvm.amdgcn.permlanex16.var(i32 %src0, i32 %src0, i32 %src1, i1 false, i1 true)
   store i32 %v, ptr addrspace(1) %out
@@ -422,8 +370,6 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv_fi_bc(ptr addrspace(1) %out,
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,1]
 ; GFX12-SDAG-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vv_fi_bc:
@@ -435,8 +381,6 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv_fi_bc(ptr addrspace(1) %out,
 ; GFX12-GISEL-NEXT:    v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,1]
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %v = call i32 @llvm.amdgcn.permlanex16.var(i32 %src0, i32 %src0, i32 %src1, i1 true, i1 true)
   store i32 %v, ptr addrspace(1) %out
@@ -455,8 +399,6 @@ define amdgpu_kernel void @v_permlane16var_b32_tid_tid(ptr addrspace(1) %out, i3
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlane16_var_b32 v0, v0, v1
 ; GFX12-SDAG-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlane16var_b32_tid_tid:
@@ -470,8 +412,6 @@ define amdgpu_kernel void @v_permlane16var_b32_tid_tid(ptr addrspace(1) %out, i3
 ; GFX12-GISEL-NEXT:    v_permlane16_var_b32 v0, v0, v1
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %tidx = call i32 @llvm.amdgcn.workitem.id.x()
   %v = call i32 @llvm.amdgcn.permlane16.var(i32 %tidx, i32 %tidx, i32 %src1, i1 false, i1 false)
@@ -491,8 +431,6 @@ define amdgpu_kernel void @v_permlane16var_b32_undef_tid(ptr addrspace(1) %out,
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlane16_var_b32 v0, v0, v1
 ; GFX12-SDAG-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlane16var_b32_undef_tid:
@@ -506,8 +444,6 @@ define amdgpu_kernel void @v_permlane16var_b32_undef_tid(ptr addrspace(1) %out,
 ; GFX12-GISEL-NEXT:    v_permlane16_var_b32 v0, v0, v1
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %tidx = call i32 @llvm.amdgcn.workitem.id.x()
   %undef = freeze i32 poison
@@ -528,8 +464,6 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid(ptr addrspace(1) %out, i32
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlane16_var_b32 v1, v0, v2
 ; GFX12-SDAG-NEXT:    global_store_b32 v3, v1, s[0:1]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlane16var_b32_i_tid:
@@ -545,8 +479,6 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid(ptr addrspace(1) %out, i32
 ; GFX12-GISEL-NEXT:    v_permlane16_var_b32 v1, v0, v2
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-GISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %tidx = call i32 @llvm.amdgcn.workitem.id.x()
   %v = call i32 @llvm.amdgcn.permlane16.var(i32 12345, i32 %tidx, i32 %src1, i1 false, i1 false)
@@ -566,8 +498,6 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid_fi(ptr addrspace(1) %out, i
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlane16_var_b32 v0, v0, v1 op_sel:[1,0]
 ; GFX12-SDAG-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlane16var_b32_i_tid_fi:
@@ -581,8 +511,6 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid_fi(ptr addrspace(1) %out, i
 ; GFX12-GISEL-NEXT:    v_permlane16_var_b32 v0, v0, v1 op_sel:[1,0]
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %tidx = call i32 @llvm.amdgcn.workitem.id.x()
   %undef = freeze i32 poison
@@ -603,8 +531,6 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid_bc(ptr addrspace(1) %out, i
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlane16_var_b32 v0, v0, v1 op_sel:[0,1]
 ; GFX12-SDAG-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlane16var_b32_i_tid_bc:
@@ -618,8 +544,6 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid_bc(ptr addrspace(1) %out, i
 ; GFX12-GISEL-NEXT:    v_permlane16_var_b32 v0, v0, v1 op_sel:[0,1]
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %tidx = call i32 @llvm.amdgcn.workitem.id.x()
   %undef = freeze i32 poison
@@ -640,8 +564,6 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid_fi_bc(ptr addrspace(1) %out
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlane16_var_b32 v0, v0, v1 op_sel:[1,1]
 ; GFX12-SDAG-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlane16var_b32_i_tid_fi_bc:
@@ -655,8 +577,6 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid_fi_bc(ptr addrspace(1) %out
 ; GFX12-GISEL-NEXT:    v_permlane16_var_b32 v0, v0, v1 op_sel:[1,1]
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %tidx = call i32 @llvm.amdgcn.workitem.id.x()
   %undef = freeze i32 poison
@@ -677,8 +597,6 @@ define amdgpu_kernel void @v_permlanex16var_b32_tid_tid(ptr addrspace(1) %out, i
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlanex16_var_b32 v0, v0, v1
 ; GFX12-SDAG-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlanex16var_b32_tid_tid:
@@ -692,8 +610,6 @@ define amdgpu_kernel void @v_permlanex16var_b32_tid_tid(ptr addrspace(1) %out, i
 ; GFX12-GISEL-NEXT:    v_permlanex16_var_b32 v0, v0, v1
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %tidx = call i32 @llvm.amdgcn.workitem.id.x()
   %v = call i32 @llvm.amdgcn.permlanex16.var(i32 %tidx, i32 %tidx, i32 %src1, i1 false, i1 false)
@@ -713,8 +629,6 @@ define amdgpu_kernel void @v_permlanex16var_b32_undef_tid(ptr addrspace(1) %out,
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlanex16_var_b32 v0, v0, v1
 ; GFX12-SDAG-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlanex16var_b32_undef_tid:
@@ -728,8 +642,6 @@ define amdgpu_kernel void @v_permlanex16var_b32_undef_tid(ptr addrspace(1) %out,
 ; GFX12-GISEL-NEXT:    v_permlanex16_var_b32 v0, v0, v1
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %tidx = call i32 @llvm.amdgcn.workitem.id.x()
   %undef = freeze i32 poison
@@ -750,8 +662,6 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid(ptr addrspace(1) %out, i32
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlanex16_var_b32 v1, v0, v2
 ; GFX12-SDAG-NEXT:    global_store_b32 v3, v1, s[0:1]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlanex16var_b32_i_tid:
@@ -767,8 +677,6 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid(ptr addrspace(1) %out, i32
 ; GFX12-GISEL-NEXT:    v_permlanex16_var_b32 v1, v0, v2
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-GISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %tidx = call i32 @llvm.amdgcn.workitem.id.x()
   %v = call i32 @llvm.amdgcn.permlanex16.var(i32 12345, i32 %tidx, i32 %src1, i1 false, i1 false)
@@ -788,8 +696,6 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid_fi(ptr addrspace(1) %out,
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,0]
 ; GFX12-SDAG-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlanex16var_b32_i_tid_fi:
@@ -803,8 +709,6 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid_fi(ptr addrspace(1) %out,
 ; GFX12-GISEL-NEXT:    v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,0]
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %tidx = call i32 @llvm.amdgcn.workitem.id.x()
   %undef = freeze i32 poison
@@ -825,8 +729,6 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid_bc(ptr addrspace(1) %out,
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlanex16_var_b32 v0, v0, v1 op_sel:[0,1]
 ; GFX12-SDAG-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlanex16var_b32_i_tid_bc:
@@ -840,8 +742,6 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid_bc(ptr addrspace(1) %out,
 ; GFX12-GISEL-NEXT:    v_permlanex16_var_b32 v0, v0, v1 op_sel:[0,1]
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %tidx = call i32 @llvm.amdgcn.workitem.id.x()
   %undef = freeze i32 poison
@@ -862,8 +762,6 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid_fi_bc(ptr addrspace(1) %ou
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-NEXT:    v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,1]
 ; GFX12-SDAG-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: v_permlanex16var_b32_i_tid_fi_bc:
@@ -877,8 +775,6 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid_fi_bc(ptr addrspace(1) %ou
 ; GFX12-GISEL-NEXT:    v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,1]
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
   %tidx = call i32 @llvm.amdgcn.workitem.id.x()
   %undef = freeze i32 poison
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll
index abb2f877781879..216731519731a0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll
@@ -16,8 +16,6 @@ define amdgpu_kernel void @test_s(ptr addrspace(1) %out, i32 %src0) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_permlane64_b32 v0, v0
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %v = call i32 @llvm.amdgcn.permlane64(i32 %src0)
   store i32 %v, ptr addrspace(1) %out
@@ -33,8 +31,6 @@ define amdgpu_kernel void @test_i(ptr addrspace(1) %out) {
 ; GFX11-NEXT:    v_permlane64_b32 v0, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %v = call i32 @llvm.amdgcn.permlane64(i32 99)
   store i32 %v, ptr addrspace(1) %out
@@ -50,8 +46,6 @@ define amdgpu_kernel void @test_v(ptr addrspace(1) %out, i32 %src0) #1 {
 ; GFX11-NEXT:    v_permlane64_b32 v0, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tidx = call i32 @llvm.amdgcn.workitem.id.x()
   %v = call i32 @llvm.amdgcn.permlane64(i32 %tidx)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ptr.ll
index afa3fe8c2f1fbd..22c369e2da72e4 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ptr.ll
@@ -12,8 +12,6 @@ define amdgpu_kernel void @test_p0(ptr addrspace(1) %out, ptr %src0) {
 ; GFX11-SDAG-NEXT:    v_permlane64_b32 v1, v0
 ; GFX11-SDAG-NEXT:    v_permlane64_b32 v0, v2
 ; GFX11-SDAG-NEXT:    global_store_b64 v3, v[0:1], s[0:1]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
   %v = call ptr @llvm.amdgcn.permlane64.p0(ptr %src0)
   store ptr %v, ptr addrspace(1) %out
@@ -42,8 +40,6 @@ define amdgpu_kernel void @test_v3p0(ptr addrspace(1) %out, <3 x ptr> %src0) {
 ; GFX11-SDAG-NEXT:    s_clause 0x1
 ; GFX11-SDAG-NEXT:    global_store_b64 v6, v[4:5], s[2:3] offset:16
 ; GFX11-SDAG-NEXT:    global_store_b128 v6, v[0:3], s[2:3]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
   %v = call <3 x ptr> @llvm.amdgcn.permlane64.v3p0(<3 x ptr> %src0)
   store <3 x ptr> %v, ptr addrspace(1) %out
@@ -61,8 +57,6 @@ define amdgpu_kernel void @test_p3(ptr addrspace(1) %out, ptr addrspace(3) %src0
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SDAG-NEXT:    v_permlane64_b32 v0, v0
 ; GFX11-SDAG-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
   %v = call ptr addrspace(3) @llvm.amdgcn.permlane64.v3p0(ptr addrspace(3) %src0)
   store ptr addrspace(3) %v, ptr addrspace(1) %out
@@ -85,8 +79,6 @@ define amdgpu_kernel void @test_v3p3(ptr addrspace(1) %out, <3 x ptr addrspace(3
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-SDAG-NEXT:    v_permlane64_b32 v0, v3
 ; GFX11-SDAG-NEXT:    global_store_b96 v4, v[0:2], s[0:1]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
   %v = call <3 x ptr addrspace(3)> @llvm.amdgcn.permlane64.v3p3(<3 x ptr addrspace(3)> %src0)
   store <3 x ptr addrspace(3)> %v, ptr addrspace(1) %out
@@ -104,8 +96,6 @@ define amdgpu_kernel void @test_p5(ptr addrspace(1) %out, ptr addrspace(5) %src0
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SDAG-NEXT:    v_permlane64_b32 v0, v0
 ; GFX11-SDAG-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
   %v = call ptr addrspace(5) @llvm.amdgcn.permlane64.p5(ptr addrspace(5) %src0)
   store ptr addrspace(5) %v, ptr addrspace(1) %out
@@ -128,8 +118,6 @@ define amdgpu_kernel void @test_v3p5(ptr addrspace(1) %out, <3 x ptr addrspace(5
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-SDAG-NEXT:    v_permlane64_b32 v0, v3
 ; GFX11-SDAG-NEXT:    global_store_b96 v4, v[0:2], s[0:1]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
   %v = call <3 x ptr addrspace(5)> @llvm.amdgcn.permlane64.v3p5(<3 x ptr addrspace(5)> %src0)
   store <3 x ptr addrspace(5)> %v, ptr addrspace(1) %out
@@ -147,8 +135,6 @@ define amdgpu_kernel void @test_p6(ptr addrspace(1) %out, ptr addrspace(6) %src0
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SDAG-NEXT:    v_permlane64_b32 v0, v0
 ; GFX11-SDAG-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
   %v = call ptr addrspace(6) @llvm.amdgcn.permlane64.p6(ptr addrspace(6) %src0)
   store ptr addrspace(6) %v, ptr addrspace(1) %out
@@ -171,8 +157,6 @@ define amdgpu_kernel void @test_v3p6(ptr addrspace(1) %out, <3 x ptr addrspace(6
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-SDAG-NEXT:    v_permlane64_b32 v0, v3
 ; GFX11-SDAG-NEXT:    global_store_b96 v4, v[0:2], s[0:1]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
   %v = call <3 x ptr addrspace(6)> @llvm.amdgcn.permlane64.v3p6(<3 x ptr addrspace(6)> %src0)
   store <3 x ptr addrspace(6)> %v, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.quadmask.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.quadmask.ll
index c951a07a44ae7d..afc5807e4f5f69 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.quadmask.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.quadmask.ll
@@ -68,8 +68,6 @@ define amdgpu_cs void @test_quadmask_sgpr_i32(i32 inreg %mask, ptr addrspace(1)
 ; GFX11-NEXT:    s_quadmask_b32 s0, s0
 ; GFX11-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %qm = call i32 @llvm.amdgcn.s.quadmask.i32(i32 %mask)
@@ -154,8 +152,6 @@ define amdgpu_cs void @test_quadmask_sgpr_i64(i64 inreg %mask, ptr addrspace(1)
 ; GFX11-NEXT:    s_quadmask_b64 s[0:1], s[0:1]
 ; GFX11-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX11-NEXT:    global_store_b64 v[0:1], v[2:3], off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %qm = call i64 @llvm.amdgcn.s.quadmask.i64(i64 %mask)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.tfe.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.tfe.ll
index af8023788d2e69..6e24717a2827d9 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.tfe.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.tfe.ll
@@ -51,8 +51,6 @@ define amdgpu_ps void @raw_buffer_load_i8_tfe(<4 x i32> inreg %rsrc, ptr addrspa
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b8 v[0:1], v4, off
 ; GFX11-NEXT:    global_store_b32 v[2:3], v5, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: raw_buffer_load_i8_tfe:
@@ -64,8 +62,6 @@ define amdgpu_ps void @raw_buffer_load_i8_tfe(<4 x i32> inreg %rsrc, ptr addrspa
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b8 v[0:1], v4, off
 ; GFX12-NEXT:    global_store_b32 v[2:3], v5, off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %res = call { i8, i32 } @llvm.amdgcn.raw.buffer.load.sl_i8i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
   %data = extractvalue { i8, i32 } %res, 0
@@ -119,8 +115,6 @@ define amdgpu_ps void @raw_buffer_load_i16_tfe(<4 x i32> inreg %rsrc, ptr addrsp
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b16 v[0:1], v4, off
 ; GFX11-NEXT:    global_store_b32 v[2:3], v5, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: raw_buffer_load_i16_tfe:
@@ -132,8 +126,6 @@ define amdgpu_ps void @raw_buffer_load_i16_tfe(<4 x i32> inreg %rsrc, ptr addrsp
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b16 v[0:1], v4, off
 ; GFX12-NEXT:    global_store_b32 v[2:3], v5, off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %res = call { i16, i32 } @llvm.amdgcn.raw.buffer.load.sl_i16i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
   %data = extractvalue { i16, i32 } %res, 0
@@ -187,8 +179,6 @@ define amdgpu_ps void @raw_buffer_load_f16_tfe(<4 x i32> inreg %rsrc, ptr addrsp
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b16 v[0:1], v4, off
 ; GFX11-NEXT:    global_store_b32 v[2:3], v5, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: raw_buffer_load_f16_tfe:
@@ -200,8 +190,6 @@ define amdgpu_ps void @raw_buffer_load_f16_tfe(<4 x i32> inreg %rsrc, ptr addrsp
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b16 v[0:1], v4, off
 ; GFX12-NEXT:    global_store_b32 v[2:3], v5, off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %res = call { half, i32 } @llvm.amdgcn.raw.buffer.load.sl_f16i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
   %data = extractvalue { half, i32 } %res, 0
@@ -255,8 +243,6 @@ define amdgpu_ps void @raw_buffer_load_i32_tfe(<4 x i32> inreg %rsrc, ptr addrsp
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v[0:1], v4, off
 ; GFX11-NEXT:    global_store_b32 v[2:3], v5, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: raw_buffer_load_i32_tfe:
@@ -268,8 +254,6 @@ define amdgpu_ps void @raw_buffer_load_i32_tfe(<4 x i32> inreg %rsrc, ptr addrsp
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v[0:1], v4, off
 ; GFX12-NEXT:    global_store_b32 v[2:3], v5, off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %res = call { i32, i32 } @llvm.amdgcn.raw.buffer.load.sl_i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
   %data = extractvalue { i32, i32 } %res, 0
@@ -343,8 +327,6 @@ define amdgpu_ps void @raw_buffer_load_v2i32_tfe(<4 x i32> inreg %rsrc, ptr addr
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b64 v[0:1], v[4:5], off
 ; GFX11-NEXT:    global_store_b32 v[2:3], v6, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: raw_buffer_load_v2i32_tfe:
@@ -356,8 +338,6 @@ define amdgpu_ps void @raw_buffer_load_v2i32_tfe(<4 x i32> inreg %rsrc, ptr addr
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b64 v[0:1], v[4:5], off
 ; GFX12-NEXT:    global_store_b32 v[2:3], v6, off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %res = call { <2 x i32>, i32 } @llvm.amdgcn.raw.buffer.load.sl_v2i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
   %data = extractvalue { <2 x i32>, i32 } %res, 0
@@ -431,8 +411,6 @@ define amdgpu_ps void @raw_buffer_load_v2f32_tfe(<4 x i32> inreg %rsrc, ptr addr
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b64 v[0:1], v[4:5], off
 ; GFX11-NEXT:    global_store_b32 v[2:3], v6, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: raw_buffer_load_v2f32_tfe:
@@ -444,8 +422,6 @@ define amdgpu_ps void @raw_buffer_load_v2f32_tfe(<4 x i32> inreg %rsrc, ptr addr
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b64 v[0:1], v[4:5], off
 ; GFX12-NEXT:    global_store_b32 v[2:3], v6, off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %res = call { <2 x float>, i32 } @llvm.amdgcn.raw.buffer.load.sl_v2f32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
   %data = extractvalue { <2 x float>, i32 } %res, 0
@@ -524,8 +500,6 @@ define amdgpu_ps void @raw_buffer_load_v3i32_tfe(<4 x i32> inreg %rsrc, ptr addr
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b96 v[0:1], v[4:6], off
 ; GFX11-NEXT:    global_store_b32 v[2:3], v7, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: raw_buffer_load_v3i32_tfe:
@@ -538,8 +512,6 @@ define amdgpu_ps void @raw_buffer_load_v3i32_tfe(<4 x i32> inreg %rsrc, ptr addr
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b96 v[0:1], v[4:6], off
 ; GFX12-NEXT:    global_store_b32 v[2:3], v7, off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %res = call { <3 x i32>, i32 } @llvm.amdgcn.raw.buffer.load.sl_v3i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
   %data = extractvalue { <3 x i32>, i32 } %res, 0
@@ -618,8 +590,6 @@ define amdgpu_ps void @raw_buffer_load_v3f32_tfe(<4 x i32> inreg %rsrc, ptr addr
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b96 v[0:1], v[4:6], off
 ; GFX11-NEXT:    global_store_b32 v[2:3], v7, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: raw_buffer_load_v3f32_tfe:
@@ -632,8 +602,6 @@ define amdgpu_ps void @raw_buffer_load_v3f32_tfe(<4 x i32> inreg %rsrc, ptr addr
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b96 v[0:1], v[4:6], off
 ; GFX12-NEXT:    global_store_b32 v[2:3], v7, off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %res = call { <3 x float>, i32 } @llvm.amdgcn.raw.buffer.load.sl_v3f32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
   %data = extractvalue { <3 x float>, i32 } %res, 0
@@ -699,8 +667,6 @@ define amdgpu_ps void @raw_buffer_load_v4i32_tfe(<4 x i32> inreg %rsrc, ptr addr
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b128 v[0:1], v[4:7], off
 ; GFX11-NEXT:    global_store_b32 v[2:3], v8, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: raw_buffer_load_v4i32_tfe:
@@ -713,8 +679,6 @@ define amdgpu_ps void @raw_buffer_load_v4i32_tfe(<4 x i32> inreg %rsrc, ptr addr
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b128 v[0:1], v[4:7], off
 ; GFX12-NEXT:    global_store_b32 v[2:3], v8, off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %res = call { <4 x i32>, i32 } @llvm.amdgcn.raw.buffer.load.sl_v4i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
   %data = extractvalue { <4 x i32>, i32 } %res, 0
@@ -780,8 +744,6 @@ define amdgpu_ps void @raw_buffer_load_v4f32_tfe(<4 x i32> inreg %rsrc, ptr addr
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b128 v[0:1], v[4:7], off
 ; GFX11-NEXT:    global_store_b32 v[2:3], v8, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: raw_buffer_load_v4f32_tfe:
@@ -794,8 +756,6 @@ define amdgpu_ps void @raw_buffer_load_v4f32_tfe(<4 x i32> inreg %rsrc, ptr addr
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b128 v[0:1], v[4:7], off
 ; GFX12-NEXT:    global_store_b32 v[2:3], v8, off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %res = call { <4 x float>, i32 } @llvm.amdgcn.raw.buffer.load.sl_v4f32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
   %data = extractvalue { <4 x float>, i32 } %res, 0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll
index 75da1adc3123cc..fd6e354b274a42 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll
@@ -17,8 +17,6 @@ define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <
 ; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
 ; GFX11-NEXT:    buffer_store_b128 v[4:7], off, s[0:3], 0 glc
 ; GFX11-NEXT:    buffer_store_b128 v[8:11], off, s[0:3], 0 slc
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 0, i32 0)
@@ -36,8 +34,6 @@ define amdgpu_ps void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) {
 ; GFX11-LABEL: buffer_store_immoffs:
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0 offset:42
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 42, i32 0, i32 0)
@@ -53,8 +49,6 @@ define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) {
 ; GFX11-LABEL: buffer_store_ofs:
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    buffer_store_b128 v[0:3], v4, s[0:3], 0 offen
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i32 0)
@@ -86,8 +80,6 @@ define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float>, i32, i32,
 ; GFX11-NEXT:    buffer_load_b128 v[0:3], v5, s[0:3], 0 offen
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_store_b128 v[0:3], v6, s[0:3], 0 offen
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i32 0)
@@ -105,8 +97,6 @@ define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %
 ; GFX11-LABEL: buffer_store_x1:
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    buffer_store_b32 v0, v1, s[0:3], 0 offen
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.raw.buffer.store.f32(float %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0)
@@ -122,8 +112,6 @@ define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data,
 ; GFX11-LABEL: buffer_store_x2:
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    buffer_store_b64 v[0:1], v2, s[0:3], 0 offen
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0)
@@ -142,8 +130,6 @@ define amdgpu_ps void @buffer_store_x1_offen_merged_and(<4 x i32> inreg %rsrc, i
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    buffer_store_b128 v[1:4], v0, s[0:3], 0 offen offset:4
 ; GFX11-NEXT:    buffer_store_b64 v[5:6], v0, s[0:3], 0 offen offset:28
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %a1 = add i32 %a, 4
   %a2 = add i32 %a, 8
@@ -174,8 +160,6 @@ define amdgpu_ps void @buffer_store_x1_offen_merged_or(<4 x i32> inreg %rsrc, i3
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    buffer_store_b128 v[1:4], v0, s[0:3], 0 offen offset:4
 ; GFX11-NEXT:    buffer_store_b64 v[5:6], v0, s[0:3], 0 offen offset:28
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %a = shl i32 %inp, 6
   %a1 = add i32 %a, 4
@@ -207,8 +191,6 @@ define amdgpu_ps void @buffer_store_x1_offen_merged_glc_slc(<4 x i32> inreg %rsr
 ; GFX11-NEXT:    buffer_store_b64 v[1:2], v0, s[0:3], 0 offen offset:4
 ; GFX11-NEXT:    buffer_store_b64 v[3:4], v0, s[0:3], 0 offen offset:12 glc
 ; GFX11-NEXT:    buffer_store_b64 v[5:6], v0, s[0:3], 0 offen offset:28 glc slc
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %a1 = add i32 %a, 4
   %a2 = add i32 %a, 8
@@ -234,8 +216,6 @@ define amdgpu_ps void @buffer_store_x2_offen_merged_and(<4 x i32> inreg %rsrc, i
 ; GFX11-LABEL: buffer_store_x2_offen_merged_and:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    buffer_store_b128 v[1:4], v0, s[0:3], 0 offen offset:4
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %a1 = add i32 %a, 4
   %a2 = add i32 %a, 12
@@ -255,8 +235,6 @@ define amdgpu_ps void @buffer_store_x2_offen_merged_or(<4 x i32> inreg %rsrc, i3
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; GFX11-NEXT:    buffer_store_b128 v[1:4], v0, s[0:3], 0 offen offset:4
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %a = shl i32 %inp, 4
   %a1 = add i32 %a, 4
@@ -278,8 +256,6 @@ define amdgpu_ps void @buffer_store_x1_offset_merged(<4 x i32> inreg %rsrc, floa
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0 offset:4
 ; GFX11-NEXT:    buffer_store_b64 v[4:5], off, s[0:3], 0 offset:28
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   call void @llvm.amdgcn.raw.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 4, i32 0, i32 0)
   call void @llvm.amdgcn.raw.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 8, i32 0, i32 0)
@@ -299,8 +275,6 @@ define amdgpu_ps void @buffer_store_x2_offset_merged(<4 x i32> inreg %rsrc, <2 x
 ; GFX11-LABEL: buffer_store_x2_offset_merged:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0 offset:4
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 4, i32 0, i32 0)
   call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %v2, <4 x i32> %rsrc, i32 12, i32 0, i32 0)
@@ -321,8 +295,6 @@ define amdgpu_ps void @buffer_store_int(<4 x i32> inreg, <4 x i32>, <2 x i32>, i
 ; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
 ; GFX11-NEXT:    buffer_store_b64 v[4:5], off, s[0:3], 0 glc
 ; GFX11-NEXT:    buffer_store_b32 v6, off, s[0:3], 0 slc
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.raw.buffer.store.v4i32(<4 x i32> %1, <4 x i32> %0, i32 0, i32 0, i32 0)
@@ -342,8 +314,6 @@ define amdgpu_ps void @raw_buffer_store_byte(<4 x i32> inreg %rsrc, float %v1) {
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX11-NEXT:    buffer_store_b8 v0, off, s[0:3], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 main_body:
   %v2 = fptoui float %v1 to i32
@@ -363,8 +333,6 @@ define amdgpu_ps void @raw_buffer_store_short(<4 x i32> inreg %rsrc, float %v1)
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 main_body:
   %v2 = fptoui float %v1 to i32
@@ -382,8 +350,6 @@ define amdgpu_ps void @raw_buffer_store_f16(<4 x i32> inreg %rsrc, i32 %v1) {
 ; GFX11-LABEL: raw_buffer_store_f16:
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 main_body:
   %trunc = trunc i32 %v1 to i16
@@ -410,8 +376,6 @@ define amdgpu_ps void @buffer_store_v2f16(<4 x i32> inreg %rsrc, <2 x half> %dat
 ; GFX11-LABEL: buffer_store_v2f16:
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    buffer_store_b32 v0, v1, s[0:3], 0 offen
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0)
@@ -440,8 +404,6 @@ define amdgpu_ps void @buffer_store_v4f16(<4 x i32> inreg %rsrc, <4 x half> %dat
 ; GFX11-LABEL: buffer_store_v4f16:
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    buffer_store_b64 v[0:1], v2, s[0:3], 0 offen
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.raw.buffer.store.v4f16(<4 x half> %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0)
@@ -457,8 +419,6 @@ define amdgpu_ps void @raw_buffer_store_i16(<4 x i32> inreg %rsrc, i32 %v1) {
 ; GFX11-LABEL: raw_buffer_store_i16:
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 main_body:
   %trunc = trunc i32 %v1 to i16
@@ -483,8 +443,6 @@ define amdgpu_ps void @buffer_store_v2i16(<4 x i32> inreg %rsrc, <2 x i16> %data
 ; GFX11-LABEL: buffer_store_v2i16:
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    buffer_store_b32 v0, v1, s[0:3], 0 offen
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.raw.buffer.store.v2i16(<2 x i16> %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0)
@@ -511,8 +469,6 @@ define amdgpu_ps void @buffer_store_v4i16(<4 x i32> inreg %rsrc, <4 x i16> %data
 ; GFX11-LABEL: buffer_store_v4i16:
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    buffer_store_b64 v[0:1], v2, s[0:3], 0 offen
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.raw.buffer.store.v4i16(<4 x i16> %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0)
@@ -531,8 +487,6 @@ define amdgpu_ps void @raw_buffer_store_x1_offset_merged(<4 x i32> inreg %rsrc,
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0 offset:4
 ; GFX11-NEXT:    buffer_store_b64 v[4:5], off, s[0:3], 0 offset:28
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   call void @llvm.amdgcn.raw.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 4, i32 0, i32 0)
   call void @llvm.amdgcn.raw.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 8, i32 0, i32 0)
@@ -563,8 +517,6 @@ define amdgpu_ps void @raw_buffer_store_x1_offset_swizzled_not_merged(<4 x i32>
 ; GFX11-NEXT:    buffer_store_b32 v3, off, s[0:3], 0 offset:16
 ; GFX11-NEXT:    buffer_store_b32 v4, off, s[0:3], 0 offset:28
 ; GFX11-NEXT:    buffer_store_b32 v5, off, s[0:3], 0 offset:32
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   call void @llvm.amdgcn.raw.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 4, i32 0, i32 8)
   call void @llvm.amdgcn.raw.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 8, i32 0, i32 8)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.ll
index 82dd35ab4c2407..e1f84dcbaa607b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.ll
@@ -31,8 +31,6 @@ define amdgpu_ps void @buffer_store_bf16(ptr addrspace(8) inreg %rsrc, bfloat %d
 ; GFX11-LABEL: buffer_store_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    buffer_store_b16 v0, v1, s[0:3], 0 offen
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   call void @llvm.amdgcn.raw.ptr.buffer.store.bf16(bfloat %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0)
   ret void
@@ -66,8 +64,6 @@ define amdgpu_ps void @buffer_store_v2bf16(ptr addrspace(8) inreg %rsrc, <2 x bf
 ; GFX11-LABEL: buffer_store_v2bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    buffer_store_b32 v0, v1, s[0:3], 0 offen
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   call void @llvm.amdgcn.raw.ptr.buffer.store.v2bf16(<2 x bfloat> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0)
   ret void
@@ -105,8 +101,6 @@ define amdgpu_ps void @buffer_store_v4bf16(ptr addrspace(8) inreg %rsrc, <4 x bf
 ; GFX11-LABEL: buffer_store_v4bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    buffer_store_b64 v[0:1], v2, s[0:3], 0 offen
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   call void @llvm.amdgcn.raw.ptr.buffer.store.v4bf16(<4 x bfloat> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0)
   ret void
@@ -158,8 +152,6 @@ define amdgpu_ps void @buffer_store_v8bf16(ptr addrspace(8) inreg %rsrc, <8 x bf
 ; GFX11-LABEL: buffer_store_v8bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    buffer_store_b128 v[0:3], v4, s[0:3], 0 offen
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   call void @llvm.amdgcn.raw.ptr.buffer.store.v8bf16(<8 x bfloat> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0)
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.store.d16.ll
index 30f04f1ff220cb..af84994ca8c4c1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.store.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.store.d16.ll
@@ -42,8 +42,6 @@ define amdgpu_kernel void @tbuffer_store_d16_x(ptr addrspace(8) %rsrc, half %dat
 ; GFX11-PACKED-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-PACKED-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX11-PACKED-NEXT:    tbuffer_store_d16_format_x v0, off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM]
-; GFX11-PACKED-NEXT:    s_nop 0
-; GFX11-PACKED-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-PACKED-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.raw.ptr.tbuffer.store.f16(half %data, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 33, i32 0)
@@ -90,8 +88,6 @@ define amdgpu_kernel void @tbuffer_store_d16_xy(ptr addrspace(8) %rsrc, <2 x hal
 ; GFX11-PACKED-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-PACKED-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX11-PACKED-NEXT:    tbuffer_store_d16_format_xy v0, off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM]
-; GFX11-PACKED-NEXT:    s_nop 0
-; GFX11-PACKED-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-PACKED-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.raw.ptr.tbuffer.store.v2f16(<2 x half> %data, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 33, i32 0)
@@ -146,8 +142,6 @@ define amdgpu_kernel void @tbuffer_store_d16_xyz(ptr addrspace(8) %rsrc, <4 x ha
 ; GFX11-PACKED-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX11-PACKED-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX11-PACKED-NEXT:    tbuffer_store_d16_format_xyz v[0:1], off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM]
-; GFX11-PACKED-NEXT:    s_nop 0
-; GFX11-PACKED-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-PACKED-NEXT:    s_endpgm
 main_body:
   %data_subvec = shufflevector <4 x half> %data, <4 x half> undef, <3 x i32> <i32 0, i32 1, i32 2>
@@ -202,8 +196,6 @@ define amdgpu_kernel void @tbuffer_store_d16_xyzw(ptr addrspace(8) %rsrc, <4 x h
 ; GFX11-PACKED-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX11-PACKED-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX11-PACKED-NEXT:    tbuffer_store_d16_format_xyzw v[0:1], off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM]
-; GFX11-PACKED-NEXT:    s_nop 0
-; GFX11-PACKED-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-PACKED-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.raw.ptr.tbuffer.store.v4f16(<4 x half> %data, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 33, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.store.ll
index 22396631b38df3..9440efefe18a9c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.store.ll
@@ -28,8 +28,6 @@ define amdgpu_ps void @tbuffer_store(ptr addrspace(8) inreg, <4 x float>, <4 x f
 ; GFX11-NEXT:    tbuffer_store_format_xyzw v[4:7], off, s[0:3], 0 format:[BUF_FMT_32_32_32_32_UINT] glc
 ; GFX11-NEXT:    tbuffer_store_format_xyzw v[8:11], off, s[0:3], 0 format:78 slc
 ; GFX11-NEXT:    tbuffer_store_format_xyzw v[8:11], off, s[0:3], 0 format:78 glc dlc
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 main_body:
   %in1 = bitcast <4 x float> %1 to <4 x i32>
@@ -56,8 +54,6 @@ define amdgpu_ps void @tbuffer_store_immoffs(ptr addrspace(8) inreg, <4 x float>
 ; GFX11-LABEL: tbuffer_store_immoffs:
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    tbuffer_store_format_xyzw v[0:3], off, s[0:3], 0 format:117 offset:42
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 main_body:
   %in1 = bitcast <4 x float> %1 to <4 x i32>
@@ -79,8 +75,6 @@ define amdgpu_ps void @tbuffer_store_scalar_and_imm_offs(ptr addrspace(8) inreg,
 ; GFX11-LABEL: tbuffer_store_scalar_and_imm_offs:
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    tbuffer_store_format_xyzw v[0:3], off, s[0:3], s4 format:117 offset:42
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 main_body:
   %in1 = bitcast <4 x float> %vdata to <4 x i32>
@@ -102,8 +96,6 @@ define amdgpu_ps void @buffer_store_ofs(ptr addrspace(8) inreg, <4 x float> %vda
 ; GFX11-LABEL: buffer_store_ofs:
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:115 offen
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 main_body:
   %in1 = bitcast <4 x float> %vdata to <4 x i32>
@@ -125,8 +117,6 @@ define amdgpu_ps void @buffer_store_x1(ptr addrspace(8) inreg %rsrc, float %data
 ; GFX11-LABEL: buffer_store_x1:
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    tbuffer_store_format_x v0, off, s[0:3], 0 format:125
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 main_body:
   %data.i = bitcast float %data to i32
@@ -148,8 +138,6 @@ define amdgpu_ps void @buffer_store_x2(ptr addrspace(8) inreg %rsrc, <2 x float>
 ; GFX11-LABEL: buffer_store_x2:
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    tbuffer_store_format_xy v[0:1], off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 main_body:
   %data.i = bitcast <2 x float> %data to <2 x i32>
@@ -171,8 +159,6 @@ define amdgpu_ps void @buffer_store_voffset_large_12bit(ptr addrspace(8) inreg %
 ; GFX11-LABEL: buffer_store_voffset_large_12bit:
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    tbuffer_store_format_xyzw v[0:3], off, s[0:3], 0 format:[BUF_FMT_32_32_32_32_FLOAT] offset:4092
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.raw.ptr.tbuffer.store.v4f32(<4 x float> %data, ptr addrspace(8) %rsrc, i32 4092, i32 0, i32 63, i32 0)
@@ -196,8 +182,6 @@ define amdgpu_ps void @buffer_store_voffset_large_13bit(ptr addrspace(8) inreg %
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    v_mov_b32_e32 v4, 0x1000
 ; GFX11-NEXT:    tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_FMT_32_32_32_32_FLOAT] offen offset:4092
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.raw.ptr.tbuffer.store.v4f32(<4 x float> %data, ptr addrspace(8) %rsrc, i32 8188, i32 0, i32 63, i32 0)
@@ -221,8 +205,6 @@ define amdgpu_ps void @buffer_store_voffset_large_16bit(ptr addrspace(8) inreg %
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    v_mov_b32_e32 v4, 0xf000
 ; GFX11-NEXT:    tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_FMT_32_32_32_32_FLOAT] offen offset:4092
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.raw.ptr.tbuffer.store.v4f32(<4 x float> %data, ptr addrspace(8) %rsrc, i32 65532, i32 0, i32 63, i32 0)
@@ -246,8 +228,6 @@ define amdgpu_ps void @buffer_store_voffset_large_23bit(ptr addrspace(8) inreg %
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    v_mov_b32_e32 v4, 0x7ff000
 ; GFX11-NEXT:    tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_FMT_32_32_32_32_FLOAT] offen offset:4092
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.raw.ptr.tbuffer.store.v4f32(<4 x float> %data, ptr addrspace(8) %rsrc, i32 8388604, i32 0, i32 63, i32 0)
@@ -271,8 +251,6 @@ define amdgpu_ps void @buffer_store_voffset_large_24bit(ptr addrspace(8) inreg %
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    v_mov_b32_e32 v4, 0xfff000
 ; GFX11-NEXT:    tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_FMT_32_32_32_32_FLOAT] offen offset:4092
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.raw.ptr.tbuffer.store.v4f32(<4 x float> %data, ptr addrspace(8) %rsrc, i32 16777212, i32 0, i32 63, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll
index a241bdeaff1a75..eb349cbbe44d17 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll
@@ -44,8 +44,6 @@ define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, half %data) {
 ; GFX11-PACKED-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-PACKED-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX11-PACKED-NEXT:    tbuffer_store_d16_format_x v0, off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM]
-; GFX11-PACKED-NEXT:    s_nop 0
-; GFX11-PACKED-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-PACKED-NEXT:    s_endpgm
 ;
 ; GFX12-PACKED-LABEL: tbuffer_store_d16_x:
@@ -56,8 +54,6 @@ define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, half %data) {
 ; GFX12-PACKED-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-PACKED-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-PACKED-NEXT:    tbuffer_store_d16_format_x v0, off, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM]
-; GFX12-PACKED-NEXT:    s_nop 0
-; GFX12-PACKED-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-PACKED-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.raw.tbuffer.store.f16(half %data, <4 x i32> %rsrc, i32 0, i32 0, i32 33, i32 0)
@@ -104,8 +100,6 @@ define amdgpu_kernel void @tbuffer_store_d16_xy(<4 x i32> %rsrc, <2 x half> %dat
 ; GFX11-PACKED-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-PACKED-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX11-PACKED-NEXT:    tbuffer_store_d16_format_xy v0, off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM]
-; GFX11-PACKED-NEXT:    s_nop 0
-; GFX11-PACKED-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-PACKED-NEXT:    s_endpgm
 ;
 ; GFX12-PACKED-LABEL: tbuffer_store_d16_xy:
@@ -116,8 +110,6 @@ define amdgpu_kernel void @tbuffer_store_d16_xy(<4 x i32> %rsrc, <2 x half> %dat
 ; GFX12-PACKED-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-PACKED-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-PACKED-NEXT:    tbuffer_store_d16_format_xy v0, off, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM]
-; GFX12-PACKED-NEXT:    s_nop 0
-; GFX12-PACKED-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-PACKED-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.raw.tbuffer.store.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 0, i32 0, i32 33, i32 0)
@@ -172,8 +164,6 @@ define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %da
 ; GFX11-PACKED-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX11-PACKED-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX11-PACKED-NEXT:    tbuffer_store_d16_format_xyz v[0:1], off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM]
-; GFX11-PACKED-NEXT:    s_nop 0
-; GFX11-PACKED-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-PACKED-NEXT:    s_endpgm
 ;
 ; GFX12-PACKED-SDAG-LABEL: tbuffer_store_d16_xyz:
@@ -186,8 +176,6 @@ define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %da
 ; GFX12-PACKED-SDAG-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-PACKED-SDAG-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX12-PACKED-SDAG-NEXT:    tbuffer_store_d16_format_xyz v[0:1], off, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM]
-; GFX12-PACKED-SDAG-NEXT:    s_nop 0
-; GFX12-PACKED-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-PACKED-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-PACKED-GISEL-LABEL: tbuffer_store_d16_xyz:
@@ -201,8 +189,6 @@ define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %da
 ; GFX12-PACKED-GISEL-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-PACKED-GISEL-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX12-PACKED-GISEL-NEXT:    tbuffer_store_d16_format_xyzw v[0:1], off, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM]
-; GFX12-PACKED-GISEL-NEXT:    s_nop 0
-; GFX12-PACKED-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-PACKED-GISEL-NEXT:    s_endpgm
 main_body:
   %data_subvec = shufflevector <4 x half> %data, <4 x half> undef, <3 x i32> <i32 0, i32 1, i32 2>
@@ -257,8 +243,6 @@ define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %d
 ; GFX11-PACKED-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX11-PACKED-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX11-PACKED-NEXT:    tbuffer_store_d16_format_xyzw v[0:1], off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM]
-; GFX11-PACKED-NEXT:    s_nop 0
-; GFX11-PACKED-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-PACKED-NEXT:    s_endpgm
 ;
 ; GFX12-PACKED-LABEL: tbuffer_store_d16_xyzw:
@@ -270,8 +254,6 @@ define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %d
 ; GFX12-PACKED-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-PACKED-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX12-PACKED-NEXT:    tbuffer_store_d16_format_xyzw v[0:1], off, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM]
-; GFX12-PACKED-NEXT:    s_nop 0
-; GFX12-PACKED-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-PACKED-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.raw.tbuffer.store.v4f16(<4 x half> %data, <4 x i32> %rsrc, i32 0, i32 0, i32 33, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.ll
index 8641bf1b03f36a..118fed169696ee 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.ll
@@ -30,8 +30,6 @@ define amdgpu_ps void @tbuffer_store(<4 x i32> inreg, <4 x float>, <4 x float>,
 ; GFX11-NEXT:    tbuffer_store_format_xyzw v[4:7], off, s[0:3], 0 format:[BUF_FMT_32_32_32_32_UINT] glc
 ; GFX11-NEXT:    tbuffer_store_format_xyzw v[8:11], off, s[0:3], 0 format:78 slc
 ; GFX11-NEXT:    tbuffer_store_format_xyzw v[8:11], off, s[0:3], 0 format:78 glc dlc
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: tbuffer_store:
@@ -41,8 +39,6 @@ define amdgpu_ps void @tbuffer_store(<4 x i32> inreg, <4 x float>, <4 x float>,
 ; GFX12-NEXT:    tbuffer_store_format_xyzw v[4:7], off, s[0:3], null format:[BUF_FMT_32_32_32_32_UINT] th:TH_STORE_NT
 ; GFX12-NEXT:    tbuffer_store_format_xyzw v[8:11], off, s[0:3], null format:78 th:TH_STORE_HT
 ; GFX12-NEXT:    tbuffer_store_format_xyzw v[8:11], off, s[0:3], null format:78 th:TH_STORE_RT_NT
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   %in1 = bitcast <4 x float> %1 to <4 x i32>
@@ -69,15 +65,11 @@ define amdgpu_ps void @tbuffer_store_immoffs(<4 x i32> inreg, <4 x float>) {
 ; GFX11-LABEL: tbuffer_store_immoffs:
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    tbuffer_store_format_xyzw v[0:3], off, s[0:3], 0 format:117 offset:42
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: tbuffer_store_immoffs:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    tbuffer_store_format_xyzw v[0:3], off, s[0:3], null format:117 offset:42
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   %in1 = bitcast <4 x float> %1 to <4 x i32>
@@ -99,15 +91,11 @@ define amdgpu_ps void @tbuffer_store_scalar_and_imm_offs(<4 x i32> inreg, <4 x f
 ; GFX11-LABEL: tbuffer_store_scalar_and_imm_offs:
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    tbuffer_store_format_xyzw v[0:3], off, s[0:3], s4 format:117 offset:42
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: tbuffer_store_scalar_and_imm_offs:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    tbuffer_store_format_xyzw v[0:3], off, s[0:3], s4 format:117 offset:42
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   %in1 = bitcast <4 x float> %vdata to <4 x i32>
@@ -129,15 +117,11 @@ define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float> %vdata, i32
 ; GFX11-LABEL: buffer_store_ofs:
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:115 offen
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: buffer_store_ofs:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    tbuffer_store_format_xyzw v[0:3], v4, s[0:3], null format:115 offen
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   %in1 = bitcast <4 x float> %vdata to <4 x i32>
@@ -159,15 +143,11 @@ define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data) {
 ; GFX11-LABEL: buffer_store_x1:
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    tbuffer_store_format_x v0, off, s[0:3], 0 format:125
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: buffer_store_x1:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    tbuffer_store_format_x v0, off, s[0:3], null format:125
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   %data.i = bitcast float %data to i32
@@ -189,15 +169,11 @@ define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data)
 ; GFX11-LABEL: buffer_store_x2:
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    tbuffer_store_format_xy v[0:1], off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: buffer_store_x2:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    tbuffer_store_format_xy v[0:1], off, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   %data.i = bitcast <2 x float> %data to <2 x i32>
@@ -219,15 +195,11 @@ define amdgpu_ps void @buffer_store_voffset_large_12bit(<4 x i32> inreg %rsrc, <
 ; GFX11-LABEL: buffer_store_voffset_large_12bit:
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    tbuffer_store_format_xyzw v[0:3], off, s[0:3], 0 format:[BUF_FMT_32_32_32_32_FLOAT] offset:4092
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: buffer_store_voffset_large_12bit:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    tbuffer_store_format_xyzw v[0:3], off, s[0:3], null format:[BUF_FMT_32_32_32_32_FLOAT] offset:4092
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.raw.tbuffer.store.v4f32(<4 x float> %data, <4 x i32> %rsrc, i32 4092, i32 0, i32 63, i32 0)
@@ -251,15 +223,11 @@ define amdgpu_ps void @buffer_store_voffset_large_13bit(<4 x i32> inreg %rsrc, <
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    v_mov_b32_e32 v4, 0x1000
 ; GFX11-NEXT:    tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_FMT_32_32_32_32_FLOAT] offen offset:4092
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: buffer_store_voffset_large_13bit:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    tbuffer_store_format_xyzw v[0:3], off, s[0:3], null format:[BUF_FMT_32_32_32_32_FLOAT] offset:8188
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.raw.tbuffer.store.v4f32(<4 x float> %data, <4 x i32> %rsrc, i32 8188, i32 0, i32 63, i32 0)
@@ -283,15 +251,11 @@ define amdgpu_ps void @buffer_store_voffset_large_16bit(<4 x i32> inreg %rsrc, <
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    v_mov_b32_e32 v4, 0xf000
 ; GFX11-NEXT:    tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_FMT_32_32_32_32_FLOAT] offen offset:4092
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: buffer_store_voffset_large_16bit:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    tbuffer_store_format_xyzw v[0:3], off, s[0:3], null format:[BUF_FMT_32_32_32_32_FLOAT] offset:65532
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.raw.tbuffer.store.v4f32(<4 x float> %data, <4 x i32> %rsrc, i32 65532, i32 0, i32 63, i32 0)
@@ -315,15 +279,11 @@ define amdgpu_ps void @buffer_store_voffset_large_23bit(<4 x i32> inreg %rsrc, <
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    v_mov_b32_e32 v4, 0x7ff000
 ; GFX11-NEXT:    tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_FMT_32_32_32_32_FLOAT] offen offset:4092
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: buffer_store_voffset_large_23bit:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    tbuffer_store_format_xyzw v[0:3], off, s[0:3], null format:[BUF_FMT_32_32_32_32_FLOAT] offset:8388604
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.raw.tbuffer.store.v4f32(<4 x float> %data, <4 x i32> %rsrc, i32 8388604, i32 0, i32 63, i32 0)
@@ -347,16 +307,12 @@ define amdgpu_ps void @buffer_store_voffset_large_24bit(<4 x i32> inreg %rsrc, <
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    v_mov_b32_e32 v4, 0xfff000
 ; GFX11-NEXT:    tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_FMT_32_32_32_32_FLOAT] offen offset:4092
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: buffer_store_voffset_large_24bit:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_mov_b32_e32 v4, 0x800000
 ; GFX12-NEXT:    tbuffer_store_format_xyzw v[0:3], v4, s[0:3], null format:[BUF_FMT_32_32_32_32_FLOAT] offen offset:8388604
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.raw.tbuffer.store.v4f32(<4 x float> %data, <4 x i32> %rsrc, i32 16777212, i32 0, i32 63, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll
index 6cc2393d598e04..ce536a36b76044 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll
@@ -89,8 +89,6 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
 ; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX1164DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX1164DAGISEL-NEXT:    s_nop 0
-; GFX1164DAGISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164DAGISEL-NEXT:    s_endpgm
 ;
 ; GFX1164GISEL-LABEL: uniform_value:
@@ -102,8 +100,6 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
 ; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX1164GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX1164GISEL-NEXT:    s_nop 0
-; GFX1164GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164GISEL-NEXT:    s_endpgm
 ;
 ; GFX1132DAGISEL-LABEL: uniform_value:
@@ -114,8 +110,6 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
 ; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4
 ; GFX1132DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX1132DAGISEL-NEXT:    s_nop 0
-; GFX1132DAGISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132DAGISEL-NEXT:    s_endpgm
 ;
 ; GFX1132GISEL-LABEL: uniform_value:
@@ -126,8 +120,6 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
 ; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4
 ; GFX1132GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX1132GISEL-NEXT:    s_nop 0
-; GFX1132GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132GISEL-NEXT:    s_endpgm
 entry:
     %result = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 %in, i32 1)
@@ -199,8 +191,6 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) {
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, 0x7b
 ; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX1164DAGISEL-NEXT:    s_nop 0
-; GFX1164DAGISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164DAGISEL-NEXT:    s_endpgm
 ;
 ; GFX1164GISEL-LABEL: const_value:
@@ -210,8 +200,6 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) {
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX1164GISEL-NEXT:    s_nop 0
-; GFX1164GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164GISEL-NEXT:    s_endpgm
 ;
 ; GFX1132DAGISEL-LABEL: const_value:
@@ -220,8 +208,6 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) {
 ; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7b
 ; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX1132DAGISEL-NEXT:    s_nop 0
-; GFX1132DAGISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132DAGISEL-NEXT:    s_endpgm
 ;
 ; GFX1132GISEL-LABEL: const_value:
@@ -230,8 +216,6 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) {
 ; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, 0x7b :: v_dual_mov_b32 v1, 0
 ; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX1132GISEL-NEXT:    s_nop 0
-; GFX1132GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132GISEL-NEXT:    s_endpgm
 entry:
     %result = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 123, i32 1)
@@ -452,8 +436,6 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) {
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164DAGISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX1164DAGISEL-NEXT:    s_nop 0
-; GFX1164DAGISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164DAGISEL-NEXT:    s_endpgm
 ;
 ; GFX1164GISEL-LABEL: divergent_value:
@@ -476,8 +458,6 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) {
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX1164GISEL-NEXT:    s_nop 0
-; GFX1164GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164GISEL-NEXT:    s_endpgm
 ;
 ; GFX1132DAGISEL-LABEL: divergent_value:
@@ -499,8 +479,6 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) {
 ; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132DAGISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX1132DAGISEL-NEXT:    s_nop 0
-; GFX1132DAGISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132DAGISEL-NEXT:    s_endpgm
 ;
 ; GFX1132GISEL-LABEL: divergent_value:
@@ -522,8 +500,6 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) {
 ; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
 ; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX1132GISEL-NEXT:    s_nop 0
-; GFX1132GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132GISEL-NEXT:    s_endpgm
 entry:
     %id.x = call i32 @llvm.amdgcn.workitem.id.x()
@@ -847,8 +823,6 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX1164DAGISEL-NEXT:    s_nop 0
-; GFX1164DAGISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164DAGISEL-NEXT:    s_endpgm
 ;
 ; GFX1164GISEL-LABEL: divergent_cfg:
@@ -886,8 +860,6 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX1164GISEL-NEXT:    s_nop 0
-; GFX1164GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164GISEL-NEXT:    s_endpgm
 ;
 ; GFX1132DAGISEL-LABEL: divergent_cfg:
@@ -926,8 +898,6 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX1132DAGISEL-NEXT:    s_nop 0
-; GFX1132DAGISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132DAGISEL-NEXT:    s_endpgm
 ;
 ; GFX1132GISEL-LABEL: divergent_cfg:
@@ -964,8 +934,6 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0
 ; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    global_store_b32 v1, v0, s[2:3]
-; GFX1132GISEL-NEXT:    s_nop 0
-; GFX1132GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132GISEL-NEXT:    s_endpgm
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll
index f7f8536219db6d..ffb27f40209048 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll
@@ -90,8 +90,6 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
 ; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX1164DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX1164DAGISEL-NEXT:    s_nop 0
-; GFX1164DAGISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164DAGISEL-NEXT:    s_endpgm
 ;
 ; GFX1164GISEL-LABEL: uniform_value:
@@ -103,8 +101,6 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
 ; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX1164GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX1164GISEL-NEXT:    s_nop 0
-; GFX1164GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164GISEL-NEXT:    s_endpgm
 ;
 ; GFX1132DAGISEL-LABEL: uniform_value:
@@ -115,8 +111,6 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
 ; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4
 ; GFX1132DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX1132DAGISEL-NEXT:    s_nop 0
-; GFX1132DAGISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132DAGISEL-NEXT:    s_endpgm
 ;
 ; GFX1132GISEL-LABEL: uniform_value:
@@ -127,8 +121,6 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
 ; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4
 ; GFX1132GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX1132GISEL-NEXT:    s_nop 0
-; GFX1132GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132GISEL-NEXT:    s_endpgm
 entry:
     %result = call i32 @llvm.amdgcn.wave.reduce.umin.i32(i32 %in, i32 1)
@@ -200,8 +192,6 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) {
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, 0x7b
 ; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX1164DAGISEL-NEXT:    s_nop 0
-; GFX1164DAGISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164DAGISEL-NEXT:    s_endpgm
 ;
 ; GFX1164GISEL-LABEL: const_value:
@@ -211,8 +201,6 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) {
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX1164GISEL-NEXT:    s_nop 0
-; GFX1164GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164GISEL-NEXT:    s_endpgm
 ;
 ; GFX1132DAGISEL-LABEL: const_value:
@@ -221,8 +209,6 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) {
 ; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7b
 ; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX1132DAGISEL-NEXT:    s_nop 0
-; GFX1132DAGISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132DAGISEL-NEXT:    s_endpgm
 ;
 ; GFX1132GISEL-LABEL: const_value:
@@ -231,8 +217,6 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) {
 ; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, 0x7b :: v_dual_mov_b32 v1, 0
 ; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX1132GISEL-NEXT:    s_nop 0
-; GFX1132GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132GISEL-NEXT:    s_endpgm
 entry:
     %result = call i32 @llvm.amdgcn.wave.reduce.umin.i32(i32 123, i32 1)
@@ -453,8 +437,6 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164DAGISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX1164DAGISEL-NEXT:    s_nop 0
-; GFX1164DAGISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164DAGISEL-NEXT:    s_endpgm
 ;
 ; GFX1164GISEL-LABEL: divergent_value:
@@ -477,8 +459,6 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX1164GISEL-NEXT:    s_nop 0
-; GFX1164GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164GISEL-NEXT:    s_endpgm
 ;
 ; GFX1132DAGISEL-LABEL: divergent_value:
@@ -500,8 +480,6 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
 ; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132DAGISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX1132DAGISEL-NEXT:    s_nop 0
-; GFX1132DAGISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132DAGISEL-NEXT:    s_endpgm
 ;
 ; GFX1132GISEL-LABEL: divergent_value:
@@ -523,8 +501,6 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
 ; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
 ; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX1132GISEL-NEXT:    s_nop 0
-; GFX1132GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132GISEL-NEXT:    s_endpgm
 entry:
     %id.x = call i32 @llvm.amdgcn.workitem.id.x()
@@ -848,8 +824,6 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX1164DAGISEL-NEXT:    s_nop 0
-; GFX1164DAGISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164DAGISEL-NEXT:    s_endpgm
 ;
 ; GFX1164GISEL-LABEL: divergent_cfg:
@@ -887,8 +861,6 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX1164GISEL-NEXT:    s_nop 0
-; GFX1164GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164GISEL-NEXT:    s_endpgm
 ;
 ; GFX1132DAGISEL-LABEL: divergent_cfg:
@@ -927,8 +899,6 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX1132DAGISEL-NEXT:    s_nop 0
-; GFX1132DAGISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132DAGISEL-NEXT:    s_endpgm
 ;
 ; GFX1132GISEL-LABEL: divergent_cfg:
@@ -965,8 +935,6 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0
 ; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    global_store_b32 v1, v0, s[2:3]
-; GFX1132GISEL-NEXT:    s_nop 0
-; GFX1132GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132GISEL-NEXT:    s_endpgm
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
index d521a6c25e462e..8d99ec2e1b709f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
@@ -110,8 +110,6 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 {
 ; VARIANT4-NEXT:    global_load_b32 v0, v[0:1], off
 ; VARIANT4-NEXT:    s_wait_loadcnt 0x0
 ; VARIANT4-NEXT:    global_store_b32 v3, v0, s[0:1]
-; VARIANT4-NEXT:    s_nop 0
-; VARIANT4-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; VARIANT4-NEXT:    s_endpgm
 ;
 ; VARIANT5-LABEL: test_barrier:
@@ -134,8 +132,6 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 {
 ; VARIANT5-NEXT:    global_load_b32 v0, v[0:1], off
 ; VARIANT5-NEXT:    s_wait_loadcnt 0x0
 ; VARIANT5-NEXT:    global_store_b32 v3, v0, s[0:1]
-; VARIANT5-NEXT:    s_nop 0
-; VARIANT5-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; VARIANT5-NEXT:    s_endpgm
 ;
 ; VARIANT6-LABEL: test_barrier:
@@ -160,8 +156,6 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 {
 ; VARIANT6-NEXT:    global_load_b32 v0, v[0:1], off
 ; VARIANT6-NEXT:    s_wait_loadcnt 0x0
 ; VARIANT6-NEXT:    global_store_b32 v5, v0, s[0:1]
-; VARIANT6-NEXT:    s_nop 0
-; VARIANT6-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; VARIANT6-NEXT:    s_endpgm
 entry:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll
index 1e13b40afb8be8..61baca24fbdc0b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll
@@ -17,8 +17,6 @@ define amdgpu_kernel void @test1_s_barrier_signal(ptr addrspace(1) %out) #0 {
 ; GFX12-SDAG-NEXT:    s_barrier_signal -1
 ; GFX12-SDAG-NEXT:    s_barrier_wait -1
 ; GFX12-SDAG-NEXT:    global_store_b32 v3, v0, s[0:1]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: test1_s_barrier_signal:
@@ -35,8 +33,6 @@ define amdgpu_kernel void @test1_s_barrier_signal(ptr addrspace(1) %out) #0 {
 ; GFX12-GISEL-NEXT:    s_barrier_signal -1
 ; GFX12-GISEL-NEXT:    s_barrier_wait -1
 ; GFX12-GISEL-NEXT:    global_store_b32 v3, v0, s[0:1]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
 entry:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
@@ -65,8 +61,6 @@ define amdgpu_kernel void @test2_s_barrier_signal(ptr addrspace(1) %out) #0 {
 ; GFX12-SDAG-NEXT:    s_barrier_signal 1
 ; GFX12-SDAG-NEXT:    s_barrier_wait 1
 ; GFX12-SDAG-NEXT:    global_store_b32 v3, v0, s[0:1]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: test2_s_barrier_signal:
@@ -83,8 +77,6 @@ define amdgpu_kernel void @test2_s_barrier_signal(ptr addrspace(1) %out) #0 {
 ; GFX12-GISEL-NEXT:    s_barrier_signal 1
 ; GFX12-GISEL-NEXT:    s_barrier_wait 1
 ; GFX12-GISEL-NEXT:    global_store_b32 v3, v0, s[0:1]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
 entry:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
@@ -113,8 +105,6 @@ define amdgpu_kernel void @test3_s_barrier_signal(ptr addrspace(1) %out) #0 {
 ; GFX12-SDAG-NEXT:    s_barrier_signal 0
 ; GFX12-SDAG-NEXT:    s_barrier_wait 0
 ; GFX12-SDAG-NEXT:    global_store_b32 v3, v0, s[0:1]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: test3_s_barrier_signal:
@@ -131,8 +121,6 @@ define amdgpu_kernel void @test3_s_barrier_signal(ptr addrspace(1) %out) #0 {
 ; GFX12-GISEL-NEXT:    s_barrier_signal 0
 ; GFX12-GISEL-NEXT:    s_barrier_wait 0
 ; GFX12-GISEL-NEXT:    global_store_b32 v3, v0, s[0:1]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
 entry:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
@@ -162,8 +150,6 @@ define amdgpu_kernel void @test1_s_barrier_signal_var(ptr addrspace(1) %out) #0
 ; GFX12-SDAG-NEXT:    s_barrier_signal m0
 ; GFX12-SDAG-NEXT:    s_barrier_wait 1
 ; GFX12-SDAG-NEXT:    global_store_b32 v3, v0, s[0:1]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: test1_s_barrier_signal_var:
@@ -182,8 +168,6 @@ define amdgpu_kernel void @test1_s_barrier_signal_var(ptr addrspace(1) %out) #0
 ; GFX12-GISEL-NEXT:    s_barrier_signal m0
 ; GFX12-GISEL-NEXT:    s_barrier_wait 1
 ; GFX12-GISEL-NEXT:    global_store_b32 v3, v0, s[0:1]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
 entry:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
@@ -247,8 +231,6 @@ define amdgpu_kernel void @test1_s_barrier_signal_isfirst(ptr addrspace(1) %a, p
 ; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    v_mul_lo_u32 v1, v1, v2
 ; GFX12-SDAG-NEXT:    global_store_b32 v0, v1, s[6:7]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: test1_s_barrier_signal_isfirst:
@@ -272,8 +254,6 @@ define amdgpu_kernel void @test1_s_barrier_signal_isfirst(ptr addrspace(1) %a, p
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    v_mul_lo_u32 v1, v1, v2
 ; GFX12-GISEL-NEXT:    global_store_b32 v0, v1, s[6:7]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
 entry:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
@@ -307,8 +287,6 @@ define amdgpu_kernel void @test2_s_barrier_signal_isfirst(ptr addrspace(1) %a, p
 ; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    v_mul_lo_u32 v1, v1, v2
 ; GFX12-SDAG-NEXT:    global_store_b32 v0, v1, s[6:7]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: test2_s_barrier_signal_isfirst:
@@ -332,8 +310,6 @@ define amdgpu_kernel void @test2_s_barrier_signal_isfirst(ptr addrspace(1) %a, p
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    v_mul_lo_u32 v1, v1, v2
 ; GFX12-GISEL-NEXT:    global_store_b32 v0, v1, s[6:7]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
 entry:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
@@ -367,8 +343,6 @@ define amdgpu_kernel void @test3_s_barrier_signal_isfirst(ptr addrspace(1) %a, p
 ; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    v_mul_lo_u32 v1, v1, v2
 ; GFX12-SDAG-NEXT:    global_store_b32 v0, v1, s[6:7]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: test3_s_barrier_signal_isfirst:
@@ -392,8 +366,6 @@ define amdgpu_kernel void @test3_s_barrier_signal_isfirst(ptr addrspace(1) %a, p
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    v_mul_lo_u32 v1, v1, v2
 ; GFX12-GISEL-NEXT:    global_store_b32 v0, v1, s[6:7]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
 entry:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
@@ -428,8 +400,6 @@ define amdgpu_kernel void @test1_s_barrier_signal_isfirst_var(ptr addrspace(1) %
 ; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    v_mul_lo_u32 v1, v1, v2
 ; GFX12-SDAG-NEXT:    global_store_b32 v0, v1, s[6:7]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: test1_s_barrier_signal_isfirst_var:
@@ -454,8 +424,6 @@ define amdgpu_kernel void @test1_s_barrier_signal_isfirst_var(ptr addrspace(1) %
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    v_mul_lo_u32 v1, v1, v2
 ; GFX12-GISEL-NEXT:    global_store_b32 v0, v1, s[6:7]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
 entry:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
@@ -557,8 +525,6 @@ define amdgpu_kernel void @test1_s_barrier_init(ptr addrspace(1) %out, i32 %mbrC
 ; GFX12-SDAG-NEXT:    s_mov_b32 m0, s2
 ; GFX12-SDAG-NEXT:    s_barrier_init -1
 ; GFX12-SDAG-NEXT:    global_store_b32 v3, v0, s[0:1]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: test1_s_barrier_init:
@@ -574,8 +540,6 @@ define amdgpu_kernel void @test1_s_barrier_init(ptr addrspace(1) %out, i32 %mbrC
 ; GFX12-GISEL-NEXT:    global_store_b32 v3, v2, s[0:1]
 ; GFX12-GISEL-NEXT:    s_barrier_init -1
 ; GFX12-GISEL-NEXT:    global_store_b32 v3, v0, s[0:1]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
 entry:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
@@ -603,8 +567,6 @@ define amdgpu_kernel void @test2_s_barrier_init(ptr addrspace(1) %out, i32 %mbrC
 ; GFX12-SDAG-NEXT:    s_mov_b32 m0, s2
 ; GFX12-SDAG-NEXT:    s_barrier_init 1
 ; GFX12-SDAG-NEXT:    global_store_b32 v3, v0, s[0:1]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: test2_s_barrier_init:
@@ -620,8 +582,6 @@ define amdgpu_kernel void @test2_s_barrier_init(ptr addrspace(1) %out, i32 %mbrC
 ; GFX12-GISEL-NEXT:    global_store_b32 v3, v2, s[0:1]
 ; GFX12-GISEL-NEXT:    s_barrier_init 1
 ; GFX12-GISEL-NEXT:    global_store_b32 v3, v0, s[0:1]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
 entry:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
@@ -649,8 +609,6 @@ define amdgpu_kernel void @test3_s_barrier_init(ptr addrspace(1) %out, i32 %mbrC
 ; GFX12-SDAG-NEXT:    s_mov_b32 m0, s2
 ; GFX12-SDAG-NEXT:    s_barrier_init 0
 ; GFX12-SDAG-NEXT:    global_store_b32 v3, v0, s[0:1]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: test3_s_barrier_init:
@@ -666,8 +624,6 @@ define amdgpu_kernel void @test3_s_barrier_init(ptr addrspace(1) %out, i32 %mbrC
 ; GFX12-GISEL-NEXT:    global_store_b32 v3, v2, s[0:1]
 ; GFX12-GISEL-NEXT:    s_barrier_init 0
 ; GFX12-GISEL-NEXT:    global_store_b32 v3, v0, s[0:1]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
 entry:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
@@ -697,8 +653,6 @@ define amdgpu_kernel void @test4_s_barrier_init(ptr addrspace(1) %out, i32 %bar,
 ; GFX12-SDAG-NEXT:    s_mov_b32 m0, s2
 ; GFX12-SDAG-NEXT:    s_barrier_init m0
 ; GFX12-SDAG-NEXT:    global_store_b32 v3, v0, s[0:1]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: test4_s_barrier_init:
@@ -715,8 +669,6 @@ define amdgpu_kernel void @test4_s_barrier_init(ptr addrspace(1) %out, i32 %bar,
 ; GFX12-GISEL-NEXT:    s_or_b32 m0, s2, s3
 ; GFX12-GISEL-NEXT:    s_barrier_init m0
 ; GFX12-GISEL-NEXT:    global_store_b32 v3, v0, s[0:1]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
 entry:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
@@ -779,8 +731,6 @@ define amdgpu_kernel void @test1_s_barrier_join(ptr addrspace(1) %out) #0 {
 ; GFX12-SDAG-NEXT:    global_store_b32 v3, v2, s[0:1]
 ; GFX12-SDAG-NEXT:    s_barrier_join -1
 ; GFX12-SDAG-NEXT:    global_store_b32 v3, v0, s[0:1]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: test1_s_barrier_join:
@@ -795,8 +745,6 @@ define amdgpu_kernel void @test1_s_barrier_join(ptr addrspace(1) %out) #0 {
 ; GFX12-GISEL-NEXT:    global_store_b32 v3, v2, s[0:1]
 ; GFX12-GISEL-NEXT:    s_barrier_join -1
 ; GFX12-GISEL-NEXT:    global_store_b32 v3, v0, s[0:1]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
 entry:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
@@ -823,8 +771,6 @@ define amdgpu_kernel void @test2_s_barrier_join(ptr addrspace(1) %out) #0 {
 ; GFX12-SDAG-NEXT:    global_store_b32 v3, v2, s[0:1]
 ; GFX12-SDAG-NEXT:    s_barrier_join 1
 ; GFX12-SDAG-NEXT:    global_store_b32 v3, v0, s[0:1]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: test2_s_barrier_join:
@@ -839,8 +785,6 @@ define amdgpu_kernel void @test2_s_barrier_join(ptr addrspace(1) %out) #0 {
 ; GFX12-GISEL-NEXT:    global_store_b32 v3, v2, s[0:1]
 ; GFX12-GISEL-NEXT:    s_barrier_join 1
 ; GFX12-GISEL-NEXT:    global_store_b32 v3, v0, s[0:1]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
 entry:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
@@ -867,8 +811,6 @@ define amdgpu_kernel void @test3_s_barrier_join(ptr addrspace(1) %out) #0 {
 ; GFX12-SDAG-NEXT:    global_store_b32 v3, v2, s[0:1]
 ; GFX12-SDAG-NEXT:    s_barrier_join 0
 ; GFX12-SDAG-NEXT:    global_store_b32 v3, v0, s[0:1]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: test3_s_barrier_join:
@@ -883,8 +825,6 @@ define amdgpu_kernel void @test3_s_barrier_join(ptr addrspace(1) %out) #0 {
 ; GFX12-GISEL-NEXT:    global_store_b32 v3, v2, s[0:1]
 ; GFX12-GISEL-NEXT:    s_barrier_join 0
 ; GFX12-GISEL-NEXT:    global_store_b32 v3, v0, s[0:1]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
 entry:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
@@ -911,8 +851,6 @@ define amdgpu_kernel void @test4_s_barrier_join_m0(ptr addrspace(1) %out, i32 %b
 ; GFX12-SDAG-NEXT:    global_store_b32 v3, v1, s[0:1]
 ; GFX12-SDAG-NEXT:    s_barrier_join m0
 ; GFX12-SDAG-NEXT:    global_store_b32 v3, v0, s[0:1]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: test4_s_barrier_join_m0:
@@ -928,8 +866,6 @@ define amdgpu_kernel void @test4_s_barrier_join_m0(ptr addrspace(1) %out, i32 %b
 ; GFX12-GISEL-NEXT:    global_store_b32 v3, v2, s[0:1]
 ; GFX12-GISEL-NEXT:    s_barrier_join m0
 ; GFX12-GISEL-NEXT:    global_store_b32 v3, v0, s[0:1]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
 entry:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1003,8 +939,6 @@ define amdgpu_kernel void @test1_s_barrier_leave(ptr addrspace(1) %a, ptr addrsp
 ; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    v_mul_lo_u32 v1, v1, v2
 ; GFX12-SDAG-NEXT:    global_store_b32 v0, v1, s[6:7]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: test1_s_barrier_leave:
@@ -1027,8 +961,6 @@ define amdgpu_kernel void @test1_s_barrier_leave(ptr addrspace(1) %a, ptr addrsp
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    v_mul_lo_u32 v1, v1, v2
 ; GFX12-GISEL-NEXT:    global_store_b32 v0, v1, s[6:7]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
 entry:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1057,8 +989,6 @@ define amdgpu_kernel void @test1_s_wakeup_barrier(ptr addrspace(1) %out) #0 {
 ; GFX12-SDAG-NEXT:    global_store_b32 v3, v2, s[0:1]
 ; GFX12-SDAG-NEXT:    s_wakeup_barrier -1
 ; GFX12-SDAG-NEXT:    global_store_b32 v3, v0, s[0:1]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: test1_s_wakeup_barrier:
@@ -1073,8 +1003,6 @@ define amdgpu_kernel void @test1_s_wakeup_barrier(ptr addrspace(1) %out) #0 {
 ; GFX12-GISEL-NEXT:    global_store_b32 v3, v2, s[0:1]
 ; GFX12-GISEL-NEXT:    s_wakeup_barrier -1
 ; GFX12-GISEL-NEXT:    global_store_b32 v3, v0, s[0:1]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
 entry:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1101,8 +1029,6 @@ define amdgpu_kernel void @test2_s_wakeup_barrier(ptr addrspace(1) %out) #0 {
 ; GFX12-SDAG-NEXT:    global_store_b32 v3, v2, s[0:1]
 ; GFX12-SDAG-NEXT:    s_wakeup_barrier 1
 ; GFX12-SDAG-NEXT:    global_store_b32 v3, v0, s[0:1]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: test2_s_wakeup_barrier:
@@ -1117,8 +1043,6 @@ define amdgpu_kernel void @test2_s_wakeup_barrier(ptr addrspace(1) %out) #0 {
 ; GFX12-GISEL-NEXT:    global_store_b32 v3, v2, s[0:1]
 ; GFX12-GISEL-NEXT:    s_wakeup_barrier 1
 ; GFX12-GISEL-NEXT:    global_store_b32 v3, v0, s[0:1]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
 entry:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1145,8 +1069,6 @@ define amdgpu_kernel void @test3_s_wakeup_barrier(ptr addrspace(1) %out) #0 {
 ; GFX12-SDAG-NEXT:    global_store_b32 v3, v2, s[0:1]
 ; GFX12-SDAG-NEXT:    s_wakeup_barrier 0
 ; GFX12-SDAG-NEXT:    global_store_b32 v3, v0, s[0:1]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: test3_s_wakeup_barrier:
@@ -1161,8 +1083,6 @@ define amdgpu_kernel void @test3_s_wakeup_barrier(ptr addrspace(1) %out) #0 {
 ; GFX12-GISEL-NEXT:    global_store_b32 v3, v2, s[0:1]
 ; GFX12-GISEL-NEXT:    s_wakeup_barrier 0
 ; GFX12-GISEL-NEXT:    global_store_b32 v3, v0, s[0:1]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
 entry:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1189,8 +1109,6 @@ define amdgpu_kernel void @test4_s_wakeup_barrier_m0(ptr addrspace(1) %out, i32
 ; GFX12-SDAG-NEXT:    global_store_b32 v3, v1, s[0:1]
 ; GFX12-SDAG-NEXT:    s_wakeup_barrier m0
 ; GFX12-SDAG-NEXT:    global_store_b32 v3, v0, s[0:1]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: test4_s_wakeup_barrier_m0:
@@ -1206,8 +1124,6 @@ define amdgpu_kernel void @test4_s_wakeup_barrier_m0(ptr addrspace(1) %out, i32
 ; GFX12-GISEL-NEXT:    global_store_b32 v3, v2, s[0:1]
 ; GFX12-GISEL-NEXT:    s_wakeup_barrier m0
 ; GFX12-GISEL-NEXT:    global_store_b32 v3, v0, s[0:1]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
 entry:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1262,8 +1178,6 @@ define amdgpu_kernel void @test1_s_get_barrier_state(ptr addrspace(1) %out) #0 {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1287,8 +1201,6 @@ define amdgpu_kernel void @test2_s_get_barrier_state(ptr addrspace(1) %out) #0 {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1312,8 +1224,6 @@ define amdgpu_kernel void @test3_s_get_barrier_state(ptr addrspace(1) %out) #0 {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1339,8 +1249,6 @@ define amdgpu_kernel void @test4_s_get_barrier_state_m0(ptr addrspace(1) %out, i
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_2)
 ; GFX12-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1419,8 +1327,6 @@ define amdgpu_kernel void @test_barrier_convert(ptr addrspace(1) %out) #0 {
 ; GFX12-SDAG-NEXT:    s_barrier_signal -1
 ; GFX12-SDAG-NEXT:    s_barrier_wait -1
 ; GFX12-SDAG-NEXT:    global_store_b32 v3, v0, s[0:1]
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: test_barrier_convert:
@@ -1437,8 +1343,6 @@ define amdgpu_kernel void @test_barrier_convert(ptr addrspace(1) %out) #0 {
 ; GFX12-GISEL-NEXT:    s_barrier_signal -1
 ; GFX12-GISEL-NEXT:    s_barrier_wait -1
 ; GFX12-GISEL-NEXT:    global_store_b32 v3, v0, s[0:1]
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
 entry:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll
index d085b3c768a868..1dbcc21500bb94 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll
@@ -736,8 +736,6 @@ define amdgpu_ps void @s_buffer_load_index_across_bb(<4 x i32> inreg %desc, i32
 ; GFX11-NEXT:    buffer_load_b32 v0, v0, s[0:3], 0 offen
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    exp mrt0 v0, v0, v0, v0 done
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: s_buffer_load_index_across_bb:
@@ -755,8 +753,6 @@ define amdgpu_ps void @s_buffer_load_index_across_bb(<4 x i32> inreg %desc, i32
 ; GFX12-NEXT:    buffer_load_b32 v0, v0, s[0:3], null offen
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    export mrt0 v0, v0, v0, v0 done
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   %tmp = shl i32 %index, 4
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll
index 363c54d4abe908..a9823d89048bdc 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll
@@ -10,8 +10,6 @@ define amdgpu_kernel void @test_get_doorbell(ptr addrspace(1) %out) {
 ; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: test_get_doorbell:
@@ -21,8 +19,6 @@ define amdgpu_kernel void @test_get_doorbell(ptr addrspace(1) %out) {
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
 ; GFX11-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
   %ret = call i32 @llvm.amdgcn.s.sendmsg.rtn.i32(i32 128)
   store i32 %ret, ptr addrspace(1) %out
@@ -37,8 +33,6 @@ define amdgpu_kernel void @test_get_ddid(ptr addrspace(1) %out) {
 ; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: test_get_ddid:
@@ -48,8 +42,6 @@ define amdgpu_kernel void @test_get_ddid(ptr addrspace(1) %out) {
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
 ; GFX11-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
   %ret = call i32 @llvm.amdgcn.s.sendmsg.rtn.i32(i32 129)
   store i32 %ret, ptr addrspace(1) %out
@@ -65,8 +57,6 @@ define amdgpu_kernel void @test_get_tma(ptr addrspace(1) %out) {
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %ret = call i64 @llvm.amdgcn.s.sendmsg.rtn.i64(i32 130)
   store i64 %ret, ptr addrspace(1) %out
@@ -82,8 +72,6 @@ define amdgpu_kernel void @test_get_realtime(ptr addrspace(1) %out) {
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %ret = call i64 @llvm.amdgcn.s.sendmsg.rtn.i64(i32 131)
   store i64 %ret, ptr addrspace(1) %out
@@ -98,8 +86,6 @@ define amdgpu_kernel void @test_savewave(ptr addrspace(1) %out) {
 ; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: test_savewave:
@@ -109,8 +95,6 @@ define amdgpu_kernel void @test_savewave(ptr addrspace(1) %out) {
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
 ; GFX11-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
   %ret = call i32 @llvm.amdgcn.s.sendmsg.rtn.i32(i32 132)
   store i32 %ret, ptr addrspace(1) %out
@@ -126,8 +110,6 @@ define amdgpu_kernel void @test_get_tba(ptr addrspace(1) %out) {
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %ret = call i64 @llvm.amdgcn.s.sendmsg.rtn.i64(i32 133)
   store i64 %ret, ptr addrspace(1) %out
@@ -142,8 +124,6 @@ define amdgpu_kernel void @test_get_0_i32(ptr addrspace(1) %out) {
 ; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: test_get_0_i32:
@@ -153,8 +133,6 @@ define amdgpu_kernel void @test_get_0_i32(ptr addrspace(1) %out) {
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
 ; GFX11-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
   %ret = call i32 @llvm.amdgcn.s.sendmsg.rtn.i32(i32 0)
   store i32 %ret, ptr addrspace(1) %out
@@ -170,8 +148,6 @@ define amdgpu_kernel void @test_get_99999_i64(ptr addrspace(1) %out) {
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %ret = call i64 @llvm.amdgcn.s.sendmsg.rtn.i64(i32 99999)
   store i64 %ret, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.tfe.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.tfe.ll
index c99a082afe2deb..60c04749c9b741 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.tfe.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.tfe.ll
@@ -51,8 +51,6 @@ define amdgpu_ps void @struct_buffer_load_i8_tfe(<4 x i32> inreg %rsrc, ptr addr
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b8 v[0:1], v4, off
 ; GFX11-NEXT:    global_store_b32 v[2:3], v5, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: struct_buffer_load_i8_tfe:
@@ -64,8 +62,6 @@ define amdgpu_ps void @struct_buffer_load_i8_tfe(<4 x i32> inreg %rsrc, ptr addr
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b8 v[0:1], v4, off
 ; GFX12-NEXT:    global_store_b32 v[2:3], v5, off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %res = call { i8, i32 } @llvm.amdgcn.struct.buffer.load.sl_i8i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
   %data = extractvalue { i8, i32 } %res, 0
@@ -119,8 +115,6 @@ define amdgpu_ps void @struct_buffer_load_i16_tfe(<4 x i32> inreg %rsrc, ptr add
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b16 v[0:1], v4, off
 ; GFX11-NEXT:    global_store_b32 v[2:3], v5, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: struct_buffer_load_i16_tfe:
@@ -132,8 +126,6 @@ define amdgpu_ps void @struct_buffer_load_i16_tfe(<4 x i32> inreg %rsrc, ptr add
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b16 v[0:1], v4, off
 ; GFX12-NEXT:    global_store_b32 v[2:3], v5, off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %res = call { i16, i32 } @llvm.amdgcn.struct.buffer.load.sl_i16i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
   %data = extractvalue { i16, i32 } %res, 0
@@ -187,8 +179,6 @@ define amdgpu_ps void @struct_buffer_load_f16_tfe(<4 x i32> inreg %rsrc, ptr add
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b16 v[0:1], v4, off
 ; GFX11-NEXT:    global_store_b32 v[2:3], v5, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: struct_buffer_load_f16_tfe:
@@ -200,8 +190,6 @@ define amdgpu_ps void @struct_buffer_load_f16_tfe(<4 x i32> inreg %rsrc, ptr add
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b16 v[0:1], v4, off
 ; GFX12-NEXT:    global_store_b32 v[2:3], v5, off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %res = call { half, i32 } @llvm.amdgcn.struct.buffer.load.sl_f16i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
   %data = extractvalue { half, i32 } %res, 0
@@ -255,8 +243,6 @@ define amdgpu_ps void @struct_buffer_load_i32_tfe(<4 x i32> inreg %rsrc, ptr add
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v[0:1], v4, off
 ; GFX11-NEXT:    global_store_b32 v[2:3], v5, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: struct_buffer_load_i32_tfe:
@@ -268,8 +254,6 @@ define amdgpu_ps void @struct_buffer_load_i32_tfe(<4 x i32> inreg %rsrc, ptr add
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v[0:1], v4, off
 ; GFX12-NEXT:    global_store_b32 v[2:3], v5, off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %res = call { i32, i32 } @llvm.amdgcn.struct.buffer.load.sl_i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
   %data = extractvalue { i32, i32 } %res, 0
@@ -343,8 +327,6 @@ define amdgpu_ps void @struct_buffer_load_v2i32_tfe(<4 x i32> inreg %rsrc, ptr a
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b64 v[0:1], v[4:5], off
 ; GFX11-NEXT:    global_store_b32 v[2:3], v6, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: struct_buffer_load_v2i32_tfe:
@@ -356,8 +338,6 @@ define amdgpu_ps void @struct_buffer_load_v2i32_tfe(<4 x i32> inreg %rsrc, ptr a
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b64 v[0:1], v[4:5], off
 ; GFX12-NEXT:    global_store_b32 v[2:3], v6, off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %res = call { <2 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.sl_v2i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
   %data = extractvalue { <2 x i32>, i32 } %res, 0
@@ -431,8 +411,6 @@ define amdgpu_ps void @struct_buffer_load_v2f32_tfe(<4 x i32> inreg %rsrc, ptr a
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b64 v[0:1], v[4:5], off
 ; GFX11-NEXT:    global_store_b32 v[2:3], v6, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: struct_buffer_load_v2f32_tfe:
@@ -444,8 +422,6 @@ define amdgpu_ps void @struct_buffer_load_v2f32_tfe(<4 x i32> inreg %rsrc, ptr a
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b64 v[0:1], v[4:5], off
 ; GFX12-NEXT:    global_store_b32 v[2:3], v6, off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %res = call { <2 x float>, i32 } @llvm.amdgcn.struct.buffer.load.sl_v2f32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
   %data = extractvalue { <2 x float>, i32 } %res, 0
@@ -524,8 +500,6 @@ define amdgpu_ps void @struct_buffer_load_v3i32_tfe(<4 x i32> inreg %rsrc, ptr a
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b96 v[0:1], v[4:6], off
 ; GFX11-NEXT:    global_store_b32 v[2:3], v7, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: struct_buffer_load_v3i32_tfe:
@@ -538,8 +512,6 @@ define amdgpu_ps void @struct_buffer_load_v3i32_tfe(<4 x i32> inreg %rsrc, ptr a
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b96 v[0:1], v[4:6], off
 ; GFX12-NEXT:    global_store_b32 v[2:3], v7, off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %res = call { <3 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.sl_v3i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
   %data = extractvalue { <3 x i32>, i32 } %res, 0
@@ -618,8 +590,6 @@ define amdgpu_ps void @struct_buffer_load_v3f32_tfe(<4 x i32> inreg %rsrc, ptr a
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b96 v[0:1], v[4:6], off
 ; GFX11-NEXT:    global_store_b32 v[2:3], v7, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: struct_buffer_load_v3f32_tfe:
@@ -632,8 +602,6 @@ define amdgpu_ps void @struct_buffer_load_v3f32_tfe(<4 x i32> inreg %rsrc, ptr a
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b96 v[0:1], v[4:6], off
 ; GFX12-NEXT:    global_store_b32 v[2:3], v7, off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %res = call { <3 x float>, i32 } @llvm.amdgcn.struct.buffer.load.sl_v3f32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
   %data = extractvalue { <3 x float>, i32 } %res, 0
@@ -699,8 +667,6 @@ define amdgpu_ps void @struct_buffer_load_v4i32_tfe(<4 x i32> inreg %rsrc, ptr a
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b128 v[0:1], v[4:7], off
 ; GFX11-NEXT:    global_store_b32 v[2:3], v8, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: struct_buffer_load_v4i32_tfe:
@@ -713,8 +679,6 @@ define amdgpu_ps void @struct_buffer_load_v4i32_tfe(<4 x i32> inreg %rsrc, ptr a
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b128 v[0:1], v[4:7], off
 ; GFX12-NEXT:    global_store_b32 v[2:3], v8, off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %res = call { <4 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.sl_v4i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
   %data = extractvalue { <4 x i32>, i32 } %res, 0
@@ -780,8 +744,6 @@ define amdgpu_ps void @struct_buffer_load_v4f32_tfe(<4 x i32> inreg %rsrc, ptr a
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b128 v[0:1], v[4:7], off
 ; GFX11-NEXT:    global_store_b32 v[2:3], v8, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: struct_buffer_load_v4f32_tfe:
@@ -794,8 +756,6 @@ define amdgpu_ps void @struct_buffer_load_v4f32_tfe(<4 x i32> inreg %rsrc, ptr a
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b128 v[0:1], v[4:7], off
 ; GFX12-NEXT:    global_store_b32 v[2:3], v8, off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %res = call { <4 x float>, i32 } @llvm.amdgcn.struct.buffer.load.sl_v4f32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
   %data = extractvalue { <4 x float>, i32 } %res, 0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll
index 94c9f7ab8e756a..87b83f68d685de 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll
@@ -19,8 +19,6 @@ define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <
 ; GFX11-NEXT:    buffer_store_b128 v[0:3], v12, s[0:3], 0 idxen
 ; GFX11-NEXT:    buffer_store_b128 v[4:7], v12, s[0:3], 0 idxen glc
 ; GFX11-NEXT:    buffer_store_b128 v[8:11], v12, s[0:3], 0 idxen slc
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 0, i32 0, i32 0)
@@ -40,8 +38,6 @@ define amdgpu_ps void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) {
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX11-NEXT:    buffer_store_b128 v[0:3], v4, s[0:3], 0 idxen offset:42
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 42, i32 0, i32 0)
@@ -57,8 +53,6 @@ define amdgpu_ps void @buffer_store_idx(<4 x i32> inreg, <4 x float>, i32) {
 ; GFX11-LABEL: buffer_store_idx:
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    buffer_store_b128 v[0:3], v4, s[0:3], 0 idxen
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i32 0, i32 0)
@@ -80,8 +74,6 @@ define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) {
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, s4
 ; GFX11-NEXT:    buffer_store_b128 v[0:3], v[4:5], s[0:3], 0 idxen offen
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 %2, i32 0, i32 0)
@@ -97,8 +89,6 @@ define amdgpu_ps void @buffer_store_both(<4 x i32> inreg, <4 x float>, i32, i32)
 ; GFX11-LABEL: buffer_store_both:
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    buffer_store_b128 v[0:3], v[4:5], s[0:3], 0 idxen offen
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 %3, i32 0, i32 0)
@@ -116,8 +106,6 @@ define amdgpu_ps void @buffer_store_both_reversed(<4 x i32> inreg, <4 x float>,
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX11-NEXT:    buffer_store_b128 v[0:3], v[5:6], s[0:3], 0 idxen offen
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %3, i32 %2, i32 0, i32 0)
@@ -149,8 +137,6 @@ define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float>, i32, i32,
 ; GFX11-NEXT:    buffer_load_b128 v[0:3], v5, s[0:3], 0 idxen
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_store_b128 v[0:3], v6, s[0:3], 0 idxen
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i32 0, i32 0)
@@ -168,8 +154,6 @@ define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %
 ; GFX11-LABEL: buffer_store_x1:
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    buffer_store_b32 v0, v1, s[0:3], 0 idxen
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.struct.buffer.store.f32(float %data, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0)
@@ -185,8 +169,6 @@ define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data,
 ; GFX11-LABEL: buffer_store_x2:
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    buffer_store_b64 v[0:1], v2, s[0:3], 0 idxen
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.struct.buffer.store.v2f32(<2 x float> %data, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0)
@@ -209,8 +191,6 @@ define amdgpu_ps void @buffer_store_int(<4 x i32> inreg, <4 x i32>, <2 x i32>, i
 ; GFX11-NEXT:    buffer_store_b128 v[0:3], v7, s[0:3], 0 idxen
 ; GFX11-NEXT:    buffer_store_b64 v[4:5], v7, s[0:3], 0 idxen glc
 ; GFX11-NEXT:    buffer_store_b32 v6, v7, s[0:3], 0 idxen slc
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.struct.buffer.store.v4i32(<4 x i32> %1, <4 x i32> %0, i32 0, i32 0, i32 0, i32 0)
@@ -230,8 +210,6 @@ define amdgpu_ps void @struct_buffer_store_byte(<4 x i32> inreg %rsrc, float %v1
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX11-NEXT:    buffer_store_b8 v0, v1, s[0:3], 0 idxen
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 main_body:
   %v2 = fptoui float %v1 to i32
@@ -251,8 +229,6 @@ define amdgpu_ps void @struct_buffer_store_f16(<4 x i32> inreg %rsrc, float %v1,
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX11-NEXT:    buffer_store_b16 v0, v1, s[0:3], 0 idxen
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %v2 = fptrunc float %v1 to half
   call void @llvm.amdgcn.struct.buffer.store.f16(half %v2, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0)
@@ -277,8 +253,6 @@ define amdgpu_ps void @struct_buffer_store_v2f16(<4 x i32> inreg %rsrc, <2 x hal
 ; GFX11-LABEL: struct_buffer_store_v2f16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    buffer_store_b32 v0, v1, s[0:3], 0 idxen
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   call void @llvm.amdgcn.struct.buffer.store.v2f16(<2 x half> %v1, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0)
   ret void
@@ -306,8 +280,6 @@ define amdgpu_ps void @struct_buffer_store_v4f16(<4 x i32> inreg %rsrc, <4 x hal
 ; GFX11-LABEL: struct_buffer_store_v4f16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    buffer_store_b64 v[0:1], v2, s[0:3], 0 idxen
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   call void @llvm.amdgcn.struct.buffer.store.v4f16(<4 x half> %v1, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0)
   ret void
@@ -324,8 +296,6 @@ define amdgpu_ps void @struct_buffer_store_i16(<4 x i32> inreg %rsrc, float %v1,
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX11-NEXT:    buffer_store_b16 v0, v1, s[0:3], 0 idxen
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 main_body:
   %v2 = fptoui float %v1 to i32
@@ -351,8 +321,6 @@ define amdgpu_ps void @struct_buffer_store_vif16(<4 x i32> inreg %rsrc, <2 x i16
 ; GFX11-LABEL: struct_buffer_store_vif16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    buffer_store_b32 v0, v1, s[0:3], 0 idxen
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   call void @llvm.amdgcn.struct.buffer.store.v2i16(<2 x i16> %v1, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0)
   ret void
@@ -378,8 +346,6 @@ define amdgpu_ps void @struct_buffer_store_v4i16(<4 x i32> inreg %rsrc, <4 x i16
 ; GFX11-LABEL: struct_buffer_store_v4i16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    buffer_store_b64 v[0:1], v2, s[0:3], 0 idxen
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   call void @llvm.amdgcn.struct.buffer.store.v4i16(<4 x i16> %v1, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0)
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.tbuffer.store.d16.ll
index 38fdcf47171aff..d625dc17286a14 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.tbuffer.store.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.tbuffer.store.d16.ll
@@ -46,8 +46,6 @@ define amdgpu_kernel void @tbuffer_store_d16_x(ptr addrspace(8) %rsrc, half %dat
 ; GFX11-PACKED-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX11-PACKED-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX11-PACKED-NEXT:    tbuffer_store_d16_format_x v0, v1, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] idxen
-; GFX11-PACKED-NEXT:    s_nop 0
-; GFX11-PACKED-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-PACKED-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.struct.ptr.tbuffer.store.f16(half %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 33, i32 0)
@@ -98,8 +96,6 @@ define amdgpu_kernel void @tbuffer_store_d16_xy(ptr addrspace(8) %rsrc, <2 x hal
 ; GFX11-PACKED-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX11-PACKED-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX11-PACKED-NEXT:    tbuffer_store_d16_format_xy v0, v1, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] idxen
-; GFX11-PACKED-NEXT:    s_nop 0
-; GFX11-PACKED-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-PACKED-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.struct.ptr.tbuffer.store.v2f16(<2 x half> %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 33, i32 0)
@@ -162,8 +158,6 @@ define amdgpu_kernel void @tbuffer_store_d16_xyz(ptr addrspace(8) %rsrc, <4 x ha
 ; GFX11-PACKED-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX11-PACKED-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX11-PACKED-NEXT:    tbuffer_store_d16_format_xyz v[0:1], v2, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] idxen
-; GFX11-PACKED-NEXT:    s_nop 0
-; GFX11-PACKED-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-PACKED-NEXT:    s_endpgm
 main_body:
   %data_subvec = shufflevector <4 x half> %data, <4 x half> undef, <3 x i32> <i32 0, i32 1, i32 2>
@@ -226,8 +220,6 @@ define amdgpu_kernel void @tbuffer_store_d16_xyzw(ptr addrspace(8) %rsrc, <4 x h
 ; GFX11-PACKED-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX11-PACKED-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX11-PACKED-NEXT:    tbuffer_store_d16_format_xyzw v[0:1], v2, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] idxen
-; GFX11-PACKED-NEXT:    s_nop 0
-; GFX11-PACKED-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-PACKED-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.struct.ptr.tbuffer.store.v4f16(<4 x half> %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 33, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.tbuffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.tbuffer.store.ll
index f84422efcf9911..753d17aee5460f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.tbuffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.tbuffer.store.ll
@@ -40,8 +40,6 @@ define amdgpu_ps void @tbuffer_store(ptr addrspace(8) inreg, <4 x float>, <4 x f
 ; GFX11-NEXT:    tbuffer_store_format_xyzw v[4:7], v12, s[0:3], 0 format:[BUF_FMT_32_32_32_32_UINT] idxen glc
 ; GFX11-NEXT:    tbuffer_store_format_xyzw v[8:11], v12, s[0:3], 0 format:78 idxen slc
 ; GFX11-NEXT:    tbuffer_store_format_xyzw v[8:11], v12, s[0:3], 0 format:78 idxen glc dlc
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 main_body:
   %in1 = bitcast <4 x float> %1 to <4 x i32>
@@ -77,8 +75,6 @@ define amdgpu_ps void @tbuffer_store_immoffs(ptr addrspace(8) inreg, <4 x float>
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX11-NEXT:    tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:117 idxen offset:42
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 main_body:
   %in1 = bitcast <4 x float> %1 to <4 x i32>
@@ -109,8 +105,6 @@ define amdgpu_ps void @tbuffer_store_scalar_and_imm_offs(ptr addrspace(8) inreg,
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX11-NEXT:    tbuffer_store_format_xyzw v[0:3], v4, s[0:3], s4 format:117 idxen offset:42
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 main_body:
   %in1 = bitcast <4 x float> %vdata to <4 x i32>
@@ -137,8 +131,6 @@ define amdgpu_ps void @buffer_store_idx(ptr addrspace(8) inreg, <4 x float> %vda
 ; GFX11-LABEL: buffer_store_idx:
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_FMT_8_8_8_8_SINT] idxen
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 main_body:
   %in1 = bitcast <4 x float> %vdata to <4 x i32>
@@ -177,8 +169,6 @@ define amdgpu_ps void @buffer_store_ofs(ptr addrspace(8) inreg, <4 x float> %vda
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, s4
 ; GFX11-NEXT:    tbuffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 format:115 idxen offen
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 main_body:
   %in1 = bitcast <4 x float> %vdata to <4 x i32>
@@ -205,8 +195,6 @@ define amdgpu_ps void @buffer_store_both(ptr addrspace(8) inreg, <4 x float> %vd
 ; GFX11-LABEL: buffer_store_both:
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    tbuffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 format:70 idxen offen
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 main_body:
   %in1 = bitcast <4 x float> %vdata to <4 x i32>
@@ -246,8 +234,6 @@ define amdgpu_ps void @buffer_store_wait(ptr addrspace(8) inreg, <4 x float> %vd
 ; GFX11-NEXT:    buffer_load_format_xyzw v[0:3], v5, s[0:3], 0 idxen
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    tbuffer_store_format_xyzw v[0:3], v6, s[0:3], 0 format:[BUF_FMT_8_8_8_8_UINT] idxen
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 main_body:
   %in1 = bitcast <4 x float> %vdata to <4 x i32>
@@ -277,8 +263,6 @@ define amdgpu_ps void @buffer_store_x1(ptr addrspace(8) inreg %rsrc, float %data
 ; GFX11-LABEL: buffer_store_x1:
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    tbuffer_store_format_x v0, v1, s[0:3], 0 format:125 idxen
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 main_body:
   %data.i = bitcast float %data to i32
@@ -305,8 +289,6 @@ define amdgpu_ps void @buffer_store_x2(ptr addrspace(8) inreg %rsrc, <2 x float>
 ; GFX11-LABEL: buffer_store_x2:
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    tbuffer_store_format_xy v[0:1], v2, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] idxen
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 main_body:
   %data.i = bitcast <2 x float> %data to <2 x i32>
@@ -337,8 +319,6 @@ define amdgpu_ps void @buffer_store_voffset_large_12bit(ptr addrspace(8) inreg %
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX11-NEXT:    tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_FMT_32_32_32_32_FLOAT] idxen offset:4092
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.struct.ptr.tbuffer.store.v4f32(<4 x float> %data, ptr addrspace(8) %rsrc, i32 0, i32 4092, i32 0, i32 63, i32 0)
@@ -376,8 +356,6 @@ define amdgpu_ps void @buffer_store_voffset_large_13bit(ptr addrspace(8) inreg %
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v5, 0x1000 :: v_dual_mov_b32 v4, s4
 ; GFX11-NEXT:    tbuffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 format:[BUF_FMT_32_32_32_32_FLOAT] idxen offen offset:4092
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.struct.ptr.tbuffer.store.v4f32(<4 x float> %data, ptr addrspace(8) %rsrc, i32 0, i32 8188, i32 0, i32 63, i32 0)
@@ -415,8 +393,6 @@ define amdgpu_ps void @buffer_store_voffset_large_16bit(ptr addrspace(8) inreg %
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v5, 0xf000 :: v_dual_mov_b32 v4, s4
 ; GFX11-NEXT:    tbuffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 format:[BUF_FMT_32_32_32_32_FLOAT] idxen offen offset:4092
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.struct.ptr.tbuffer.store.v4f32(<4 x float> %data, ptr addrspace(8) %rsrc, i32 0, i32 65532, i32 0, i32 63, i32 0)
@@ -454,8 +430,6 @@ define amdgpu_ps void @buffer_store_voffset_large_23bit(ptr addrspace(8) inreg %
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v5, 0x7ff000 :: v_dual_mov_b32 v4, s4
 ; GFX11-NEXT:    tbuffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 format:[BUF_FMT_32_32_32_32_FLOAT] idxen offen offset:4092
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.struct.ptr.tbuffer.store.v4f32(<4 x float> %data, ptr addrspace(8) %rsrc, i32 0, i32 8388604, i32 0, i32 63, i32 0)
@@ -493,8 +467,6 @@ define amdgpu_ps void @buffer_store_voffset_large_24bit(ptr addrspace(8) inreg %
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v5, 0xfff000 :: v_dual_mov_b32 v4, s4
 ; GFX11-NEXT:    tbuffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 format:[BUF_FMT_32_32_32_32_FLOAT] idxen offen offset:4092
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.struct.ptr.tbuffer.store.v4f32(<4 x float> %data, ptr addrspace(8) %rsrc, i32 0, i32 16777212, i32 0, i32 63, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll
index 1da076c6523990..e4199e199feb14 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll
@@ -49,8 +49,6 @@ define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, half %data, i32
 ; GFX11-PACKED-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX11-PACKED-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX11-PACKED-NEXT:    tbuffer_store_d16_format_x v0, v1, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] idxen
-; GFX11-PACKED-NEXT:    s_nop 0
-; GFX11-PACKED-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-PACKED-NEXT:    s_endpgm
 ;
 ; GFX12-PACKED-LABEL: tbuffer_store_d16_x:
@@ -62,8 +60,6 @@ define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, half %data, i32
 ; GFX12-PACKED-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-PACKED-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX12-PACKED-NEXT:    tbuffer_store_d16_format_x v0, v1, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM] idxen
-; GFX12-PACKED-NEXT:    s_nop 0
-; GFX12-PACKED-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-PACKED-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.struct.tbuffer.store.f16(half %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 33, i32 0)
@@ -114,8 +110,6 @@ define amdgpu_kernel void @tbuffer_store_d16_xy(<4 x i32> %rsrc, <2 x half> %dat
 ; GFX11-PACKED-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX11-PACKED-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX11-PACKED-NEXT:    tbuffer_store_d16_format_xy v0, v1, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] idxen
-; GFX11-PACKED-NEXT:    s_nop 0
-; GFX11-PACKED-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-PACKED-NEXT:    s_endpgm
 ;
 ; GFX12-PACKED-LABEL: tbuffer_store_d16_xy:
@@ -127,8 +121,6 @@ define amdgpu_kernel void @tbuffer_store_d16_xy(<4 x i32> %rsrc, <2 x half> %dat
 ; GFX12-PACKED-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-PACKED-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX12-PACKED-NEXT:    tbuffer_store_d16_format_xy v0, v1, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM] idxen
-; GFX12-PACKED-NEXT:    s_nop 0
-; GFX12-PACKED-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-PACKED-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.struct.tbuffer.store.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 33, i32 0)
@@ -191,8 +183,6 @@ define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %da
 ; GFX11-PACKED-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX11-PACKED-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX11-PACKED-NEXT:    tbuffer_store_d16_format_xyz v[0:1], v2, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] idxen
-; GFX11-PACKED-NEXT:    s_nop 0
-; GFX11-PACKED-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-PACKED-NEXT:    s_endpgm
 ;
 ; GFX12-PACKED-SDAG-LABEL: tbuffer_store_d16_xyz:
@@ -206,8 +196,6 @@ define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %da
 ; GFX12-PACKED-SDAG-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX12-PACKED-SDAG-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX12-PACKED-SDAG-NEXT:    tbuffer_store_d16_format_xyz v[0:1], v2, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM] idxen
-; GFX12-PACKED-SDAG-NEXT:    s_nop 0
-; GFX12-PACKED-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-PACKED-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-PACKED-GISEL-LABEL: tbuffer_store_d16_xyz:
@@ -221,8 +209,6 @@ define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %da
 ; GFX12-PACKED-GISEL-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-PACKED-GISEL-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX12-PACKED-GISEL-NEXT:    tbuffer_store_d16_format_xyzw v[0:1], v2, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM] idxen
-; GFX12-PACKED-GISEL-NEXT:    s_nop 0
-; GFX12-PACKED-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-PACKED-GISEL-NEXT:    s_endpgm
 main_body:
   %data_subvec = shufflevector <4 x half> %data, <4 x half> undef, <3 x i32> <i32 0, i32 1, i32 2>
@@ -285,8 +271,6 @@ define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %d
 ; GFX11-PACKED-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX11-PACKED-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX11-PACKED-NEXT:    tbuffer_store_d16_format_xyzw v[0:1], v2, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] idxen
-; GFX11-PACKED-NEXT:    s_nop 0
-; GFX11-PACKED-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-PACKED-NEXT:    s_endpgm
 ;
 ; GFX12-PACKED-LABEL: tbuffer_store_d16_xyzw:
@@ -299,8 +283,6 @@ define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %d
 ; GFX12-PACKED-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX12-PACKED-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX12-PACKED-NEXT:    tbuffer_store_d16_format_xyzw v[0:1], v2, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM] idxen
-; GFX12-PACKED-NEXT:    s_nop 0
-; GFX12-PACKED-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-PACKED-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.struct.tbuffer.store.v4f16(<4 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 33, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.ll
index 47b7658f50cc5a..ab0f18938298f6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.ll
@@ -41,8 +41,6 @@ define amdgpu_ps void @tbuffer_store(<4 x i32> inreg, <4 x float>, <4 x float>,
 ; GFX11-NEXT:    tbuffer_store_format_xyzw v[4:7], v12, s[0:3], 0 format:[BUF_FMT_32_32_32_32_UINT] idxen glc
 ; GFX11-NEXT:    tbuffer_store_format_xyzw v[8:11], v12, s[0:3], 0 format:78 idxen slc
 ; GFX11-NEXT:    tbuffer_store_format_xyzw v[8:11], v12, s[0:3], 0 format:78 idxen glc dlc
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: tbuffer_store:
@@ -53,8 +51,6 @@ define amdgpu_ps void @tbuffer_store(<4 x i32> inreg, <4 x float>, <4 x float>,
 ; GFX12-NEXT:    tbuffer_store_format_xyzw v[4:7], v12, s[0:3], null format:[BUF_FMT_32_32_32_32_UINT] idxen th:TH_STORE_NT
 ; GFX12-NEXT:    tbuffer_store_format_xyzw v[8:11], v12, s[0:3], null format:78 idxen th:TH_STORE_HT
 ; GFX12-NEXT:    tbuffer_store_format_xyzw v[8:11], v12, s[0:3], null format:78 idxen th:TH_STORE_RT_NT
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   %in1 = bitcast <4 x float> %1 to <4 x i32>
@@ -90,16 +86,12 @@ define amdgpu_ps void @tbuffer_store_immoffs(<4 x i32> inreg, <4 x float>) {
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX11-NEXT:    tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:117 idxen offset:42
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: tbuffer_store_immoffs:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX12-NEXT:    tbuffer_store_format_xyzw v[0:3], v4, s[0:3], null format:117 idxen offset:42
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   %in1 = bitcast <4 x float> %1 to <4 x i32>
@@ -130,16 +122,12 @@ define amdgpu_ps void @tbuffer_store_scalar_and_imm_offs(<4 x i32> inreg, <4 x f
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX11-NEXT:    tbuffer_store_format_xyzw v[0:3], v4, s[0:3], s4 format:117 idxen offset:42
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: tbuffer_store_scalar_and_imm_offs:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX12-NEXT:    tbuffer_store_format_xyzw v[0:3], v4, s[0:3], s4 format:117 idxen offset:42
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   %in1 = bitcast <4 x float> %vdata to <4 x i32>
@@ -166,15 +154,11 @@ define amdgpu_ps void @buffer_store_idx(<4 x i32> inreg, <4 x float> %vdata, i32
 ; GFX11-LABEL: buffer_store_idx:
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_FMT_8_8_8_8_SINT] idxen
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: buffer_store_idx:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    tbuffer_store_format_xyzw v[0:3], v4, s[0:3], null format:[BUF_FMT_8_8_8_8_SINT] idxen
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   %in1 = bitcast <4 x float> %vdata to <4 x i32>
@@ -213,16 +197,12 @@ define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float> %vdata, i32
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, s4
 ; GFX11-NEXT:    tbuffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 format:115 idxen offen
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: buffer_store_ofs:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, 0
 ; GFX12-NEXT:    tbuffer_store_format_xyzw v[0:3], v[4:5], s[0:3], null format:115 idxen offen
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   %in1 = bitcast <4 x float> %vdata to <4 x i32>
@@ -249,15 +229,11 @@ define amdgpu_ps void @buffer_store_both(<4 x i32> inreg, <4 x float> %vdata, i3
 ; GFX11-LABEL: buffer_store_both:
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    tbuffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 format:70 idxen offen
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: buffer_store_both:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    tbuffer_store_format_xyzw v[0:3], v[4:5], s[0:3], null format:70 idxen offen
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   %in1 = bitcast <4 x float> %vdata to <4 x i32>
@@ -297,8 +273,6 @@ define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float> %vdata, i3
 ; GFX11-NEXT:    buffer_load_format_xyzw v[0:3], v5, s[0:3], 0 idxen
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    tbuffer_store_format_xyzw v[0:3], v6, s[0:3], 0 format:[BUF_FMT_8_8_8_8_UINT] idxen
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: buffer_store_wait:
@@ -307,8 +281,6 @@ define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float> %vdata, i3
 ; GFX12-NEXT:    buffer_load_format_xyzw v[0:3], v5, s[0:3], null idxen
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    tbuffer_store_format_xyzw v[0:3], v6, s[0:3], null format:[BUF_FMT_8_8_8_8_UINT] idxen
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   %in1 = bitcast <4 x float> %vdata to <4 x i32>
@@ -338,15 +310,11 @@ define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %
 ; GFX11-LABEL: buffer_store_x1:
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    tbuffer_store_format_x v0, v1, s[0:3], 0 format:125 idxen
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: buffer_store_x1:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    tbuffer_store_format_x v0, v1, s[0:3], null format:125 idxen
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   %data.i = bitcast float %data to i32
@@ -373,15 +341,11 @@ define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data,
 ; GFX11-LABEL: buffer_store_x2:
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    tbuffer_store_format_xy v[0:1], v2, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] idxen
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: buffer_store_x2:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    tbuffer_store_format_xy v[0:1], v2, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM] idxen
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   %data.i = bitcast <2 x float> %data to <2 x i32>
@@ -412,16 +376,12 @@ define amdgpu_ps void @buffer_store_voffset_large_12bit(<4 x i32> inreg %rsrc, <
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX11-NEXT:    tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_FMT_32_32_32_32_FLOAT] idxen offset:4092
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: buffer_store_voffset_large_12bit:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX12-NEXT:    tbuffer_store_format_xyzw v[0:3], v4, s[0:3], null format:[BUF_FMT_32_32_32_32_FLOAT] idxen offset:4092
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> %data, <4 x i32> %rsrc, i32 0, i32 4092, i32 0, i32 63, i32 0)
@@ -459,16 +419,12 @@ define amdgpu_ps void @buffer_store_voffset_large_13bit(<4 x i32> inreg %rsrc, <
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v5, 0x1000 :: v_dual_mov_b32 v4, s4
 ; GFX11-NEXT:    tbuffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 format:[BUF_FMT_32_32_32_32_FLOAT] idxen offen offset:4092
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: buffer_store_voffset_large_13bit:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX12-NEXT:    tbuffer_store_format_xyzw v[0:3], v4, s[0:3], null format:[BUF_FMT_32_32_32_32_FLOAT] idxen offset:8188
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> %data, <4 x i32> %rsrc, i32 0, i32 8188, i32 0, i32 63, i32 0)
@@ -506,16 +462,12 @@ define amdgpu_ps void @buffer_store_voffset_large_16bit(<4 x i32> inreg %rsrc, <
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v5, 0xf000 :: v_dual_mov_b32 v4, s4
 ; GFX11-NEXT:    tbuffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 format:[BUF_FMT_32_32_32_32_FLOAT] idxen offen offset:4092
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: buffer_store_voffset_large_16bit:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX12-NEXT:    tbuffer_store_format_xyzw v[0:3], v4, s[0:3], null format:[BUF_FMT_32_32_32_32_FLOAT] idxen offset:65532
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> %data, <4 x i32> %rsrc, i32 0, i32 65532, i32 0, i32 63, i32 0)
@@ -553,16 +505,12 @@ define amdgpu_ps void @buffer_store_voffset_large_23bit(<4 x i32> inreg %rsrc, <
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v5, 0x7ff000 :: v_dual_mov_b32 v4, s4
 ; GFX11-NEXT:    tbuffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 format:[BUF_FMT_32_32_32_32_FLOAT] idxen offen offset:4092
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: buffer_store_voffset_large_23bit:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX12-NEXT:    tbuffer_store_format_xyzw v[0:3], v4, s[0:3], null format:[BUF_FMT_32_32_32_32_FLOAT] idxen offset:8388604
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> %data, <4 x i32> %rsrc, i32 0, i32 8388604, i32 0, i32 63, i32 0)
@@ -600,16 +548,12 @@ define amdgpu_ps void @buffer_store_voffset_large_24bit(<4 x i32> inreg %rsrc, <
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v5, 0xfff000 :: v_dual_mov_b32 v4, s4
 ; GFX11-NEXT:    tbuffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 format:[BUF_FMT_32_32_32_32_FLOAT] idxen offen offset:4092
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: buffer_store_voffset_large_24bit:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    v_dual_mov_b32 v5, 0x800000 :: v_dual_mov_b32 v4, 0
 ; GFX12-NEXT:    tbuffer_store_format_xyzw v[0:3], v[4:5], s[0:3], null format:[BUF_FMT_32_32_32_32_FLOAT] idxen offen offset:8388604
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> %data, <4 x i32> %rsrc, i32 0, i32 16777212, i32 0, i32 63, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.id.ll
index 5ea89bc5749108..d44d2a4f73389c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.id.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.id.ll
@@ -23,8 +23,6 @@ define amdgpu_cs void @test_wave_id(ptr addrspace(1) %out) {
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX12-NEXT:    global_store_b32 v[0:1], v2, off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %waveid = call i32 @llvm.amdgcn.wave.id()
   store i32 %waveid, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll
index 3d5e6dca45618a..3874a456590dcc 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll
@@ -19,8 +19,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<16 x half> %A, <16 x half> %B
 ; W32-NEXT:    s_clause 0x1
 ; W32-NEXT:    global_store_b128 v[24:25], v[20:23], off offset:16
 ; W32-NEXT:    global_store_b128 v[24:25], v[16:19], off
-; W32-NEXT:    s_nop 0
-; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half> %A, <16 x half> %B, <8 x float> %C)
@@ -37,8 +35,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<16 x i16> %A, <16 x i16> %B,
 ; W32-NEXT:    s_clause 0x1
 ; W32-NEXT:    global_store_b128 v[24:25], v[20:23], off offset:16
 ; W32-NEXT:    global_store_b128 v[24:25], v[16:19], off
-; W32-NEXT:    s_nop 0
-; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <8 x float> %C)
@@ -55,8 +51,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<16 x half> %A, <16 x half>
 ; W32-NEXT:    s_clause 0x1
 ; W32-NEXT:    global_store_b128 v[24:25], v[20:23], off offset:16
 ; W32-NEXT:    global_store_b128 v[24:25], v[16:19], off
-; W32-NEXT:    s_nop 0
-; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
   %res = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, i1 0)
@@ -71,8 +65,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<16 x half> %A, <16 x half>
 ; W32-NEXT:    s_clause 0x1
 ; W32-NEXT:    global_store_b128 v[24:25], v[20:23], off offset:16
 ; W32-NEXT:    global_store_b128 v[24:25], v[16:19], off
-; W32-NEXT:    s_nop 0
-; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
   %res = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, i1 1)
@@ -91,8 +83,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_untied(<16 x half> %A.0, <16 x
 ; W32-NEXT:    s_clause 0x1
 ; W32-NEXT:    global_store_b128 v[42:43], v[36:39], off offset:16
 ; W32-NEXT:    global_store_b128 v[42:43], v[32:35], off
-; W32-NEXT:    s_nop 0
-; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
   %res.0 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A.0, <16 x half> %B.0, <16 x half> %C, i1 0)
@@ -118,8 +108,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_tied(<16 x half> %A.0, <16 x h
 ; W32-NEXT:    s_clause 0x1
 ; W32-NEXT:    global_store_b128 v[42:43], v[36:39], off offset:16
 ; W32-NEXT:    global_store_b128 v[42:43], v[32:35], off
-; W32-NEXT:    s_nop 0
-; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
   %res.0 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.tied(<16 x half> %A.0, <16 x half> %B.0, <16 x half> %C, i1 0)
@@ -138,8 +126,6 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<16 x i16> %A, <16 x i16>
 ; W32-NEXT:    s_clause 0x1
 ; W32-NEXT:    global_store_b128 v[24:25], v[20:23], off offset:16
 ; W32-NEXT:    global_store_b128 v[24:25], v[16:19], off
-; W32-NEXT:    s_nop 0
-; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
   %res = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, i1 0)
@@ -154,8 +140,6 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<16 x i16> %A, <16 x i16>
 ; W32-NEXT:    s_clause 0x1
 ; W32-NEXT:    global_store_b128 v[24:25], v[20:23], off offset:16
 ; W32-NEXT:    global_store_b128 v[24:25], v[16:19], off
-; W32-NEXT:    s_nop 0
-; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
   %res = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, i1 1)
@@ -174,8 +158,6 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_untied(<16 x i16> %A.0, <16
 ; W32-NEXT:    s_clause 0x1
 ; W32-NEXT:    global_store_b128 v[42:43], v[36:39], off offset:16
 ; W32-NEXT:    global_store_b128 v[42:43], v[32:35], off
-; W32-NEXT:    s_nop 0
-; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
   %res.0 = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A.0, <16 x i16> %B.0, <16 x i16> %C, i1 0)
@@ -201,8 +183,6 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_tied(<16 x i16> %A.0, <16 x
 ; W32-NEXT:    s_clause 0x1
 ; W32-NEXT:    global_store_b128 v[42:43], v[36:39], off offset:16
 ; W32-NEXT:    global_store_b128 v[42:43], v[32:35], off
-; W32-NEXT:    s_nop 0
-; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
   %res.0 = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.tied(<16 x i16> %A.0, <16 x i16> %B.0, <16 x i16> %C, i1 0)
@@ -221,8 +201,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A
 ; W32-NEXT:    s_clause 0x1
 ; W32-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
 ; W32-NEXT:    global_store_b128 v[16:17], v[8:11], off
-; W32-NEXT:    s_nop 0
-; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 0)
@@ -237,8 +215,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed(<4 x i32> %A,
 ; W32-NEXT:    s_clause 0x1
 ; W32-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
 ; W32-NEXT:    global_store_b128 v[16:17], v[8:11], off
-; W32-NEXT:    s_nop 0
-; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 0)
@@ -253,8 +229,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned(<4 x i32> %A,
 ; W32-NEXT:    s_clause 0x1
 ; W32-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
 ; W32-NEXT:    global_store_b128 v[16:17], v[8:11], off
-; W32-NEXT:    s_nop 0
-; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 0)
@@ -269,8 +243,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed(<4 x i32> %A, <4
 ; W32-NEXT:    s_clause 0x1
 ; W32-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
 ; W32-NEXT:    global_store_b128 v[16:17], v[8:11], off
-; W32-NEXT:    s_nop 0
-; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 0)
@@ -285,8 +257,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp(<4 x i
 ; W32-NEXT:    s_clause 0x1
 ; W32-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
 ; W32-NEXT:    global_store_b128 v[16:17], v[8:11], off
-; W32-NEXT:    s_nop 0
-; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 1)
@@ -301,8 +271,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp(<4 x i32
 ; W32-NEXT:    s_clause 0x1
 ; W32-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
 ; W32-NEXT:    global_store_b128 v[16:17], v[8:11], off
-; W32-NEXT:    s_nop 0
-; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 1)
@@ -317,8 +285,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp(<4 x i32
 ; W32-NEXT:    s_clause 0x1
 ; W32-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
 ; W32-NEXT:    global_store_b128 v[16:17], v[8:11], off
-; W32-NEXT:    s_nop 0
-; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 1)
@@ -333,8 +299,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed_clamp(<4 x i32>
 ; W32-NEXT:    s_clause 0x1
 ; W32-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
 ; W32-NEXT:    global_store_b128 v[16:17], v[8:11], off
-; W32-NEXT:    s_nop 0
-; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 1)
@@ -351,8 +315,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned(<2 x i32> %A
 ; W32-NEXT:    s_clause 0x1
 ; W32-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
 ; W32-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; W32-NEXT:    s_nop 0
-; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
@@ -367,8 +329,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed(<2 x i32> %A,
 ; W32-NEXT:    s_clause 0x1
 ; W32-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
 ; W32-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; W32-NEXT:    s_nop 0
-; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
@@ -383,8 +343,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned(<2 x i32> %A,
 ; W32-NEXT:    s_clause 0x1
 ; W32-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
 ; W32-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; W32-NEXT:    s_nop 0
-; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
@@ -399,8 +357,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed(<2 x i32> %A, <2
 ; W32-NEXT:    s_clause 0x1
 ; W32-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
 ; W32-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; W32-NEXT:    s_nop 0
-; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
@@ -416,8 +372,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp(<2 x i
 ; W32-NEXT:    s_clause 0x1
 ; W32-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
 ; W32-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; W32-NEXT:    s_nop 0
-; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
@@ -432,8 +386,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp(<2 x i32
 ; W32-NEXT:    s_clause 0x1
 ; W32-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
 ; W32-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; W32-NEXT:    s_nop 0
-; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 1)
@@ -448,8 +400,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp(<2 x i32
 ; W32-NEXT:    s_clause 0x1
 ; W32-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
 ; W32-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; W32-NEXT:    s_nop 0
-; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
@@ -464,8 +414,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed_clamp(<2 x i32>
 ; W32-NEXT:    s_clause 0x1
 ; W32-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
 ; W32-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; W32-NEXT:    s_nop 0
-; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 1)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll
index d11aaf0036b3ab..25adc25d717680 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll
@@ -17,8 +17,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<16 x half> %A, <16 x half> %B
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19]
 ; W64-NEXT:    global_store_b128 v[20:21], v[16:19], off
-; W64-NEXT:    s_nop 0
-; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v16f16(<16 x half> %A, <16 x half> %B, <4 x float> %C)
@@ -33,8 +31,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<16 x i16> %A, <16 x i16> %B,
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19]
 ; W64-NEXT:    global_store_b128 v[20:21], v[16:19], off
-; W64-NEXT:    s_nop 0
-; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v16i16(<16 x i16> %A, <16 x i16> %B, <4 x float> %C)
@@ -49,8 +45,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<16 x half> %A, <16 x half>
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19]
 ; W64-NEXT:    global_store_b128 v[20:21], v[16:19], off
-; W64-NEXT:    s_nop 0
-; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
   %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v16f16(<16 x half> %A, <16 x half> %B, <8 x half> %C, i1 0)
@@ -63,8 +57,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<16 x half> %A, <16 x half>
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1]
 ; W64-NEXT:    global_store_b128 v[20:21], v[16:19], off
-; W64-NEXT:    s_nop 0
-; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
   %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v16f16(<16 x half> %A, <16 x half> %B, <8 x half> %C, i1 1)
@@ -79,8 +71,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_untied(<16 x half> %A.0, <16 x
 ; W64-NEXT:    v_wmma_f16_16x16x16_f16 v[32:35], v[16:23], v[24:31], v[32:35]
 ; W64-NEXT:    global_store_b128 v[36:37], v[40:43], off
 ; W64-NEXT:    global_store_b128 v[38:39], v[32:35], off
-; W64-NEXT:    s_nop 0
-; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
   %res.0 = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v16f16(<16 x half> %A.0, <16 x half> %B.0, <8 x half> %C, i1 0)
@@ -102,8 +92,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_tied(<16 x half> %A.0, <16 x h
 ; W64-NEXT:    v_wmma_f16_16x16x16_f16 v[40:43], v[0:7], v[8:15], v[40:43]
 ; W64-NEXT:    global_store_b128 v[36:37], v[40:43], off
 ; W64-NEXT:    global_store_b128 v[38:39], v[32:35], off
-; W64-NEXT:    s_nop 0
-; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
   %res.0 = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.tied.v8f16.v16f16(<16 x half> %A.0, <16 x half> %B.0, <8 x half> %C, i1 0)
@@ -120,8 +108,6 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<16 x i16> %A, <16 x i16>
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19]
 ; W64-NEXT:    global_store_b128 v[20:21], v[16:19], off
-; W64-NEXT:    s_nop 0
-; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
   %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v16i16(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, i1 0)
@@ -134,8 +120,6 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<16 x i16> %A, <16 x i16>
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1]
 ; W64-NEXT:    global_store_b128 v[20:21], v[16:19], off
-; W64-NEXT:    s_nop 0
-; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
   %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v16i16(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, i1 1)
@@ -150,8 +134,6 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_untied(<16 x i16> %A.0, <16
 ; W64-NEXT:    v_wmma_bf16_16x16x16_bf16 v[32:35], v[16:23], v[24:31], v[32:35]
 ; W64-NEXT:    global_store_b128 v[36:37], v[40:43], off
 ; W64-NEXT:    global_store_b128 v[38:39], v[32:35], off
-; W64-NEXT:    s_nop 0
-; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
   %res.0 = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v16i16(<16 x i16> %A.0, <16 x i16> %B.0, <8 x i16> %C, i1 0)
@@ -173,8 +155,6 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_tied(<16 x i16> %A.0, <16 x
 ; W64-NEXT:    v_wmma_bf16_16x16x16_bf16 v[40:43], v[0:7], v[8:15], v[40:43]
 ; W64-NEXT:    global_store_b128 v[36:37], v[40:43], off
 ; W64-NEXT:    global_store_b128 v[38:39], v[32:35], off
-; W64-NEXT:    s_nop 0
-; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
   %res.0 = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.tied.v8i16.v16i16(<16 x i16> %A.0, <16 x i16> %B.0, <8 x i16> %C, i1 0)
@@ -191,8 +171,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11]
 ; W64-NEXT:    global_store_b128 v[12:13], v[8:11], off
-; W64-NEXT:    s_nop 0
-; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 0)
@@ -206,8 +184,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed(<4 x i32> %A,
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0]
 ; W64-NEXT:    global_store_b128 v[12:13], v[8:11], off
-; W64-NEXT:    s_nop 0
-; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 0)
@@ -220,8 +196,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned(<4 x i32> %A,
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0]
 ; W64-NEXT:    global_store_b128 v[12:13], v[8:11], off
-; W64-NEXT:    s_nop 0
-; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 0)
@@ -234,8 +208,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed(<4 x i32> %A, <4
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,1,0]
 ; W64-NEXT:    global_store_b128 v[12:13], v[8:11], off
-; W64-NEXT:    s_nop 0
-; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 0)
@@ -248,8 +220,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp(<4 x i
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] clamp
 ; W64-NEXT:    global_store_b128 v[12:13], v[8:11], off
-; W64-NEXT:    s_nop 0
-; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 1)
@@ -262,8 +232,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp(<4 x i32
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] clamp
 ; W64-NEXT:    global_store_b128 v[12:13], v[8:11], off
-; W64-NEXT:    s_nop 0
-; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 1)
@@ -276,8 +244,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp(<4 x i32
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] clamp
 ; W64-NEXT:    global_store_b128 v[12:13], v[8:11], off
-; W64-NEXT:    s_nop 0
-; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 1)
@@ -290,8 +256,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed_clamp(<4 x i32>
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,1,0] clamp
 ; W64-NEXT:    global_store_b128 v[12:13], v[8:11], off
-; W64-NEXT:    s_nop 0
-; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 1)
@@ -306,8 +270,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned(<2 x i32> %A
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7]
 ; W64-NEXT:    global_store_b128 v[8:9], v[4:7], off
-; W64-NEXT:    s_nop 0
-; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 0)
@@ -320,8 +282,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed(<2 x i32> %A,
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0]
 ; W64-NEXT:    global_store_b128 v[8:9], v[4:7], off
-; W64-NEXT:    s_nop 0
-; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 0)
@@ -334,8 +294,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned(<2 x i32> %A,
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0]
 ; W64-NEXT:    global_store_b128 v[8:9], v[4:7], off
-; W64-NEXT:    s_nop 0
-; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 0)
@@ -348,8 +306,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed(<2 x i32> %A, <2
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,1,0]
 ; W64-NEXT:    global_store_b128 v[8:9], v[4:7], off
-; W64-NEXT:    s_nop 0
-; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 0)
@@ -362,8 +318,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp(<2 x i
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] clamp
 ; W64-NEXT:    global_store_b128 v[8:9], v[4:7], off
-; W64-NEXT:    s_nop 0
-; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 1)
@@ -376,8 +330,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp(<2 x i32
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] clamp
 ; W64-NEXT:    global_store_b128 v[8:9], v[4:7], off
-; W64-NEXT:    s_nop 0
-; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 1)
@@ -390,8 +342,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp(<2 x i32
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] clamp
 ; W64-NEXT:    global_store_b128 v[8:9], v[4:7], off
-; W64-NEXT:    s_nop 0
-; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 1)
@@ -404,8 +354,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed_clamp(<2 x i32>
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,1,0] clamp
 ; W64-NEXT:    global_store_b128 v[8:9], v[4:7], off
-; W64-NEXT:    s_nop 0
-; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 1)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.ll
index d51efc2b275db9..0418f32c3f7316 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.ll
@@ -64,8 +64,6 @@ define amdgpu_cs void @test_s_wqm_sgpr_i32(i32 inreg %mask, ptr addrspace(1) %ou
 ; GFX11-NEXT:    s_wqm_b32 s0, s0
 ; GFX11-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %br = call i32 @llvm.amdgcn.s.wqm.i32(i32 %mask)
@@ -145,8 +143,6 @@ define amdgpu_cs void @test_s_wqm_sgpr_i64(i64 inreg %mask, ptr addrspace(1) %ou
 ; GFX11-NEXT:    s_wqm_b64 s[0:1], s[0:1]
 ; GFX11-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX11-NEXT:    global_store_b64 v[0:1], v[2:3], off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %br = call i64 @llvm.amdgcn.s.wqm.i64(i64 %mask)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
index 9d93ca65683c42..837d484583d53f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
@@ -48,8 +48,6 @@ define amdgpu_kernel void @test_writelane_sreg_i32(ptr addrspace(1) %out, i32 %s
 ; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX1100-SDAG-NEXT:    v_writelane_b32 v0, s2, s3
 ; GFX1100-SDAG-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX1100-SDAG-NEXT:    s_nop 0
-; GFX1100-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT:    s_endpgm
 ;
 ; GFX802-GISEL-LABEL: test_writelane_sreg_i32:
@@ -88,8 +86,6 @@ define amdgpu_kernel void @test_writelane_sreg_i32(ptr addrspace(1) %out, i32 %s
 ; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX1100-GISEL-NEXT:    v_writelane_b32 v0, s2, s3
 ; GFX1100-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX1100-GISEL-NEXT:    s_nop 0
-; GFX1100-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-GISEL-NEXT:    s_endpgm
   %oldval = load i32, ptr addrspace(1) %out
   %writelane = call i32 @llvm.amdgcn.writelane.i32(i32 %src0, i32 %src1, i32 %oldval)
@@ -145,8 +141,6 @@ define amdgpu_kernel void @test_writelane_sreg_i64(ptr addrspace(1) %out, i64 %s
 ; GFX1100-SDAG-NEXT:    v_writelane_b32 v1, s7, s2
 ; GFX1100-SDAG-NEXT:    v_writelane_b32 v0, s6, s2
 ; GFX1100-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX1100-SDAG-NEXT:    s_nop 0
-; GFX1100-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT:    s_endpgm
 ;
 ; GFX802-GISEL-LABEL: test_writelane_sreg_i64:
@@ -196,8 +190,6 @@ define amdgpu_kernel void @test_writelane_sreg_i64(ptr addrspace(1) %out, i64 %s
 ; GFX1100-GISEL-NEXT:    v_writelane_b32 v0, s6, s2
 ; GFX1100-GISEL-NEXT:    v_writelane_b32 v1, s7, s2
 ; GFX1100-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX1100-GISEL-NEXT:    s_nop 0
-; GFX1100-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-GISEL-NEXT:    s_endpgm
   %oldval = load i64, ptr addrspace(1) %out
   %writelane = call i64 @llvm.amdgcn.writelane.i64(i64 %src0, i32 %src1, i64 %oldval)
@@ -253,8 +245,6 @@ define amdgpu_kernel void @test_writelane_sreg_f64(ptr addrspace(1) %out, double
 ; GFX1100-SDAG-NEXT:    v_writelane_b32 v1, s7, s2
 ; GFX1100-SDAG-NEXT:    v_writelane_b32 v0, s6, s2
 ; GFX1100-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX1100-SDAG-NEXT:    s_nop 0
-; GFX1100-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT:    s_endpgm
 ;
 ; GFX802-GISEL-LABEL: test_writelane_sreg_f64:
@@ -304,8 +294,6 @@ define amdgpu_kernel void @test_writelane_sreg_f64(ptr addrspace(1) %out, double
 ; GFX1100-GISEL-NEXT:    v_writelane_b32 v0, s6, s2
 ; GFX1100-GISEL-NEXT:    v_writelane_b32 v1, s7, s2
 ; GFX1100-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX1100-GISEL-NEXT:    s_nop 0
-; GFX1100-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-GISEL-NEXT:    s_endpgm
   %oldval = load double, ptr addrspace(1) %out
   %writelane = call double @llvm.amdgcn.writelane.f64(double %src0, i32 %src1, double %oldval)
@@ -354,8 +342,6 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i32(ptr addrspace(1) %out, i3
 ; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v0, s3
 ; GFX1100-SDAG-NEXT:    v_writelane_b32 v0, 32, s2
 ; GFX1100-SDAG-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX1100-SDAG-NEXT:    s_nop 0
-; GFX1100-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT:    s_endpgm
 ;
 ; GFX802-GISEL-LABEL: test_writelane_imm_sreg_i32:
@@ -398,8 +384,6 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i32(ptr addrspace(1) %out, i3
 ; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v0, s3
 ; GFX1100-GISEL-NEXT:    v_writelane_b32 v0, 32, s2
 ; GFX1100-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX1100-GISEL-NEXT:    s_nop 0
-; GFX1100-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-GISEL-NEXT:    s_endpgm
   %oldval = load i32, ptr addrspace(1) %out
   %writelane = call i32 @llvm.amdgcn.writelane.i32(i32 32, i32 %src1, i32 %oldval)
@@ -454,8 +438,6 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i64(ptr addrspace(1) %out, i3
 ; GFX1100-SDAG-NEXT:    v_writelane_b32 v1, 0, s4
 ; GFX1100-SDAG-NEXT:    v_writelane_b32 v0, 32, s4
 ; GFX1100-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX1100-SDAG-NEXT:    s_nop 0
-; GFX1100-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT:    s_endpgm
 ;
 ; GFX802-GISEL-LABEL: test_writelane_imm_sreg_i64:
@@ -504,8 +486,6 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i64(ptr addrspace(1) %out, i3
 ; GFX1100-GISEL-NEXT:    v_writelane_b32 v0, 32, s4
 ; GFX1100-GISEL-NEXT:    v_writelane_b32 v1, 0, s4
 ; GFX1100-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX1100-GISEL-NEXT:    s_nop 0
-; GFX1100-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-GISEL-NEXT:    s_endpgm
   %oldval = load i64, ptr addrspace(1) %out
   %writelane = call i64 @llvm.amdgcn.writelane.i64(i64 32, i32 %src1, i64 %oldval)
@@ -565,8 +545,6 @@ define amdgpu_kernel void @test_writelane_imm_sreg_f64(ptr addrspace(1) %out, i3
 ; GFX1100-SDAG-NEXT:    v_writelane_b32 v1, s2, s4
 ; GFX1100-SDAG-NEXT:    v_writelane_b32 v0, 0, s4
 ; GFX1100-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX1100-SDAG-NEXT:    s_nop 0
-; GFX1100-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT:    s_endpgm
 ;
 ; GFX802-GISEL-LABEL: test_writelane_imm_sreg_f64:
@@ -619,8 +597,6 @@ define amdgpu_kernel void @test_writelane_imm_sreg_f64(ptr addrspace(1) %out, i3
 ; GFX1100-GISEL-NEXT:    v_writelane_b32 v0, 0, s4
 ; GFX1100-GISEL-NEXT:    v_writelane_b32 v1, s2, s4
 ; GFX1100-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX1100-GISEL-NEXT:    s_nop 0
-; GFX1100-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-GISEL-NEXT:    s_endpgm
   %oldval = load double, ptr addrspace(1) %out
   %writelane = call double @llvm.amdgcn.writelane.f64(double 32.0, i32 %src1, double %oldval)
@@ -686,8 +662,6 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i32(ptr addrspace(1) %out, p
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1100-SDAG-NEXT:    v_writelane_b32 v1, 12, s2
 ; GFX1100-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX1100-SDAG-NEXT:    s_nop 0
-; GFX1100-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT:    s_endpgm
 ;
 ; GFX802-GISEL-LABEL: test_writelane_vreg_lane_i32:
@@ -748,8 +722,6 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i32(ptr addrspace(1) %out, p
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_writelane_b32 v1, 12, s2
 ; GFX1100-GISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX1100-GISEL-NEXT:    s_nop 0
-; GFX1100-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.in = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 %tid
@@ -825,8 +797,6 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i64(ptr addrspace(1) %out, p
 ; GFX1100-SDAG-NEXT:    v_writelane_b32 v1, 0, s3
 ; GFX1100-SDAG-NEXT:    v_writelane_b32 v0, 12, s3
 ; GFX1100-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX1100-SDAG-NEXT:    s_nop 0
-; GFX1100-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT:    s_endpgm
 ;
 ; GFX802-GISEL-LABEL: test_writelane_vreg_lane_i64:
@@ -890,8 +860,6 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i64(ptr addrspace(1) %out, p
 ; GFX1100-GISEL-NEXT:    v_writelane_b32 v1, 12, s2
 ; GFX1100-GISEL-NEXT:    v_writelane_b32 v2, 0, s2
 ; GFX1100-GISEL-NEXT:    global_store_b64 v0, v[1:2], s[0:1]
-; GFX1100-GISEL-NEXT:    s_nop 0
-; GFX1100-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.in = getelementptr <2 x i64>, ptr addrspace(1) %in, i32 %tid
@@ -972,8 +940,6 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p
 ; GFX1100-SDAG-NEXT:    v_writelane_b32 v1, s2, s3
 ; GFX1100-SDAG-NEXT:    v_writelane_b32 v0, 0, s3
 ; GFX1100-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX1100-SDAG-NEXT:    s_nop 0
-; GFX1100-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT:    s_endpgm
 ;
 ; GFX802-GISEL-LABEL: test_writelane_vreg_lane_f64:
@@ -1041,8 +1007,6 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p
 ; GFX1100-GISEL-NEXT:    v_writelane_b32 v1, 0, s2
 ; GFX1100-GISEL-NEXT:    v_writelane_b32 v2, s3, s2
 ; GFX1100-GISEL-NEXT:    global_store_b64 v0, v[1:2], s[0:1]
-; GFX1100-GISEL-NEXT:    s_nop 0
-; GFX1100-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.in = getelementptr <2 x double>, ptr addrspace(1) %in, i32 %tid
@@ -1108,8 +1072,6 @@ define amdgpu_kernel void @test_writelane_m0_sreg_i32(ptr addrspace(1) %out, i32
 ; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v0, s3
 ; GFX1100-SDAG-NEXT:    v_writelane_b32 v0, m0, s2
 ; GFX1100-SDAG-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX1100-SDAG-NEXT:    s_nop 0
-; GFX1100-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT:    s_endpgm
 ;
 ; GFX802-GISEL-LABEL: test_writelane_m0_sreg_i32:
@@ -1163,8 +1125,6 @@ define amdgpu_kernel void @test_writelane_m0_sreg_i32(ptr addrspace(1) %out, i32
 ; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v0, s3
 ; GFX1100-GISEL-NEXT:    v_writelane_b32 v0, m0, s2
 ; GFX1100-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX1100-GISEL-NEXT:    s_nop 0
-; GFX1100-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-GISEL-NEXT:    s_endpgm
   %oldval = load i32, ptr addrspace(1) %out
   %m0 = call i32 asm "s_mov_b32 m0, -1", "={m0}"()
@@ -1214,8 +1174,6 @@ define amdgpu_kernel void @test_writelane_imm_i32(ptr addrspace(1) %out, i32 %sr
 ; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v0, s3
 ; GFX1100-SDAG-NEXT:    v_writelane_b32 v0, s2, 32
 ; GFX1100-SDAG-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX1100-SDAG-NEXT:    s_nop 0
-; GFX1100-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT:    s_endpgm
 ;
 ; GFX802-GISEL-LABEL: test_writelane_imm_i32:
@@ -1258,8 +1216,6 @@ define amdgpu_kernel void @test_writelane_imm_i32(ptr addrspace(1) %out, i32 %sr
 ; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v0, s3
 ; GFX1100-GISEL-NEXT:    v_writelane_b32 v0, s2, 32
 ; GFX1100-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX1100-GISEL-NEXT:    s_nop 0
-; GFX1100-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-GISEL-NEXT:    s_endpgm
   %oldval = load i32, ptr addrspace(1) %out
   %writelane = call i32 @llvm.amdgcn.writelane.i32(i32 %src0, i32 32, i32 %oldval) #0
@@ -1309,8 +1265,6 @@ define amdgpu_kernel void @test_writelane_imm_i64(ptr addrspace(1) %out, i64 %sr
 ; GFX1100-SDAG-NEXT:    v_writelane_b32 v1, s3, 32
 ; GFX1100-SDAG-NEXT:    v_writelane_b32 v0, s2, 32
 ; GFX1100-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX1100-SDAG-NEXT:    s_nop 0
-; GFX1100-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT:    s_endpgm
 ;
 ; GFX802-GISEL-LABEL: test_writelane_imm_i64:
@@ -1354,8 +1308,6 @@ define amdgpu_kernel void @test_writelane_imm_i64(ptr addrspace(1) %out, i64 %sr
 ; GFX1100-GISEL-NEXT:    v_writelane_b32 v0, s2, 32
 ; GFX1100-GISEL-NEXT:    v_writelane_b32 v1, s3, 32
 ; GFX1100-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX1100-GISEL-NEXT:    s_nop 0
-; GFX1100-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-GISEL-NEXT:    s_endpgm
   %oldval = load i64, ptr addrspace(1) %out
   %writelane = call i64 @llvm.amdgcn.writelane.i64(i64 %src0, i32 32, i64 %oldval) #0
@@ -1405,8 +1357,6 @@ define amdgpu_kernel void @test_writelane_imm_f64(ptr addrspace(1) %out, double
 ; GFX1100-SDAG-NEXT:    v_writelane_b32 v1, s3, 32
 ; GFX1100-SDAG-NEXT:    v_writelane_b32 v0, s2, 32
 ; GFX1100-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX1100-SDAG-NEXT:    s_nop 0
-; GFX1100-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT:    s_endpgm
 ;
 ; GFX802-GISEL-LABEL: test_writelane_imm_f64:
@@ -1450,8 +1400,6 @@ define amdgpu_kernel void @test_writelane_imm_f64(ptr addrspace(1) %out, double
 ; GFX1100-GISEL-NEXT:    v_writelane_b32 v0, s2, 32
 ; GFX1100-GISEL-NEXT:    v_writelane_b32 v1, s3, 32
 ; GFX1100-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX1100-GISEL-NEXT:    s_nop 0
-; GFX1100-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-GISEL-NEXT:    s_endpgm
   %oldval = load double, ptr addrspace(1) %out
   %writelane = call double @llvm.amdgcn.writelane.f64(double %src0, i32 32, double %oldval) #0
@@ -1495,8 +1443,6 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i32(i32 inreg %oldval, ptr
 ; GFX1100-SDAG-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX1100-SDAG-NEXT:    v_writelane_b32 v0, s2, s3
 ; GFX1100-SDAG-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX1100-SDAG-NEXT:    s_nop 0
-; GFX1100-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT:    s_endpgm
 ;
 ; GFX802-GISEL-LABEL: test_writelane_sreg_oldval_i32:
@@ -1534,8 +1480,6 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i32(i32 inreg %oldval, ptr
 ; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX1100-GISEL-NEXT:    v_writelane_b32 v0, s2, s3
 ; GFX1100-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX1100-GISEL-NEXT:    s_nop 0
-; GFX1100-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-GISEL-NEXT:    s_endpgm
   %writelane = call i32 @llvm.amdgcn.writelane.i32(i32 %src0, i32 %src1, i32 %oldval)
   store i32 %writelane, ptr addrspace(1) %out, align 4
@@ -1587,8 +1531,6 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i64(i64 inreg %oldval, ptr
 ; GFX1100-SDAG-NEXT:    v_writelane_b32 v1, s1, s2
 ; GFX1100-SDAG-NEXT:    v_writelane_b32 v0, s0, s2
 ; GFX1100-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[6:7]
-; GFX1100-SDAG-NEXT:    s_nop 0
-; GFX1100-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT:    s_endpgm
 ;
 ; GFX802-GISEL-LABEL: test_writelane_sreg_oldval_i64:
@@ -1635,8 +1577,6 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i64(i64 inreg %oldval, ptr
 ; GFX1100-GISEL-NEXT:    v_writelane_b32 v0, s0, s2
 ; GFX1100-GISEL-NEXT:    v_writelane_b32 v1, s1, s2
 ; GFX1100-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[6:7]
-; GFX1100-GISEL-NEXT:    s_nop 0
-; GFX1100-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-GISEL-NEXT:    s_endpgm
   %writelane = call i64 @llvm.amdgcn.writelane.i64(i64 %src0, i32 %src1, i64 %oldval)
   store i64 %writelane, ptr addrspace(1) %out, align 4
@@ -1688,8 +1628,6 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_f64(double inreg %oldval,
 ; GFX1100-SDAG-NEXT:    v_writelane_b32 v1, s1, s2
 ; GFX1100-SDAG-NEXT:    v_writelane_b32 v0, s0, s2
 ; GFX1100-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[6:7]
-; GFX1100-SDAG-NEXT:    s_nop 0
-; GFX1100-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT:    s_endpgm
 ;
 ; GFX802-GISEL-LABEL: test_writelane_sreg_oldval_f64:
@@ -1736,8 +1674,6 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_f64(double inreg %oldval,
 ; GFX1100-GISEL-NEXT:    v_writelane_b32 v0, s0, s2
 ; GFX1100-GISEL-NEXT:    v_writelane_b32 v1, s1, s2
 ; GFX1100-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[6:7]
-; GFX1100-GISEL-NEXT:    s_nop 0
-; GFX1100-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-GISEL-NEXT:    s_endpgm
   %writelane = call double @llvm.amdgcn.writelane.f64(double %src0, i32 %src1, double %oldval)
   store double %writelane, ptr addrspace(1) %out, align 4
@@ -1775,8 +1711,6 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i32(ptr addrspace(1) %out,
 ; GFX1100-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_writelane_b32 v0, s2, s3
 ; GFX1100-SDAG-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX1100-SDAG-NEXT:    s_nop 0
-; GFX1100-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT:    s_endpgm
 ;
 ; GFX802-GISEL-LABEL: test_writelane_imm_oldval_i32:
@@ -1809,8 +1743,6 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i32(ptr addrspace(1) %out,
 ; GFX1100-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_writelane_b32 v0, s2, s3
 ; GFX1100-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX1100-GISEL-NEXT:    s_nop 0
-; GFX1100-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-GISEL-NEXT:    s_endpgm
   %writelane = call i32 @llvm.amdgcn.writelane.i32(i32 %src0, i32 %src1, i32 42)
   store i32 %writelane, ptr addrspace(1) %out, align 4
@@ -1859,8 +1791,6 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i64(ptr addrspace(1) %out,
 ; GFX1100-SDAG-NEXT:    v_writelane_b32 v1, s7, s0
 ; GFX1100-SDAG-NEXT:    v_writelane_b32 v0, s6, s0
 ; GFX1100-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX1100-SDAG-NEXT:    s_nop 0
-; GFX1100-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT:    s_endpgm
 ;
 ; GFX802-GISEL-LABEL: test_writelane_imm_oldval_i64:
@@ -1904,8 +1834,6 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i64(ptr addrspace(1) %out,
 ; GFX1100-GISEL-NEXT:    v_writelane_b32 v0, s6, s0
 ; GFX1100-GISEL-NEXT:    v_writelane_b32 v1, s7, s0
 ; GFX1100-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX1100-GISEL-NEXT:    s_nop 0
-; GFX1100-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-GISEL-NEXT:    s_endpgm
   %writelane = call i64 @llvm.amdgcn.writelane.i64(i64 %src0, i32 %src1, i64 42)
   store i64 %writelane, ptr addrspace(1) %out, align 4
@@ -1954,8 +1882,6 @@ define amdgpu_kernel void @test_writelane_imm_oldval_f64(ptr addrspace(1) %out,
 ; GFX1100-SDAG-NEXT:    v_writelane_b32 v1, s7, s0
 ; GFX1100-SDAG-NEXT:    v_writelane_b32 v0, s6, s0
 ; GFX1100-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX1100-SDAG-NEXT:    s_nop 0
-; GFX1100-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT:    s_endpgm
 ;
 ; GFX802-GISEL-LABEL: test_writelane_imm_oldval_f64:
@@ -1999,8 +1925,6 @@ define amdgpu_kernel void @test_writelane_imm_oldval_f64(ptr addrspace(1) %out,
 ; GFX1100-GISEL-NEXT:    v_writelane_b32 v0, s6, s0
 ; GFX1100-GISEL-NEXT:    v_writelane_b32 v1, s7, s0
 ; GFX1100-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX1100-GISEL-NEXT:    s_nop 0
-; GFX1100-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-GISEL-NEXT:    s_endpgm
   %writelane = call double @llvm.amdgcn.writelane.f64(double %src0, i32 %src1, double 42.0)
   store double %writelane, ptr addrspace(1) %out, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll
index 4faa482ede59f7..42a59ec7bccb45 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll
@@ -62,8 +62,6 @@ define amdgpu_kernel void @ceil_f16(
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_ceil_f16_e32 v0.l, v0.l
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: ceil_f16:
@@ -82,8 +80,6 @@ define amdgpu_kernel void @ceil_f16(
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FAKE16-NEXT:    v_ceil_f16_e32 v0, v0
 ; GFX11-FAKE16-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-FAKE16-NEXT:    s_nop 0
-; GFX11-FAKE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
@@ -171,8 +167,6 @@ define amdgpu_kernel void @ceil_v2f16(
 ; GFX11-NEXT:    v_mov_b16_e32 v0.l, v0.h
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v1, v0
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: ceil_v2f16:
@@ -195,8 +189,6 @@ define amdgpu_kernel void @ceil_v2f16(
 ; GFX11-FAKE16-NEXT:    v_ceil_f16_e32 v1, v1
 ; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX11-FAKE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-FAKE16-NEXT:    s_nop 0
-; GFX11-FAKE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
index 0b18e5f35a3107..62bdef89b5a0bc 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
@@ -79,8 +79,6 @@ define amdgpu_kernel void @cos_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cos_f16_e32 v1, v1
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %a.val = load half, ptr addrspace(1) %a
   %r.val = call half @llvm.cos.f16(half %a.val)
@@ -189,8 +187,6 @@ define amdgpu_kernel void @cos_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_pack_b32_f16 v1, v1, v2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %a.val = load <2 x half>, ptr addrspace(1) %a
   %r.val = call <2 x half> @llvm.cos.v2f16(<2 x half> %a.val)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll
index 61f6c9f7f0e6ff..6036150e189054 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll
@@ -62,8 +62,6 @@ define amdgpu_kernel void @floor_f16(
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_floor_f16_e32 v0.l, v0.l
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: floor_f16:
@@ -82,8 +80,6 @@ define amdgpu_kernel void @floor_f16(
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FAKE16-NEXT:    v_floor_f16_e32 v0, v0
 ; GFX11-FAKE16-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-FAKE16-NEXT:    s_nop 0
-; GFX11-FAKE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
@@ -172,8 +168,6 @@ define amdgpu_kernel void @floor_v2f16(
 ; GFX11-NEXT:    v_mov_b16_e32 v0.l, v0.h
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v1, v0
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: floor_v2f16:
@@ -196,8 +190,6 @@ define amdgpu_kernel void @floor_v2f16(
 ; GFX11-FAKE16-NEXT:    v_floor_f16_e32 v1, v1
 ; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX11-FAKE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-FAKE16-NEXT:    s_nop 0
-; GFX11-FAKE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
index 92658e660ea676..10f87e74f39d81 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
@@ -189,8 +189,6 @@ define amdgpu_kernel void @fmuladd_f16(
 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FLUSH-NEXT:    v_add_f16_e32 v0, v0, v2
 ; GFX11-FLUSH-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
-; GFX11-FLUSH-NEXT:    s_nop 0
-; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FLUSH-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-LABEL: fmuladd_f16:
@@ -219,8 +217,6 @@ define amdgpu_kernel void @fmuladd_f16(
 ; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-DENORM-NEXT:    v_fmac_f16_e32 v2, v0, v1
 ; GFX11-DENORM-NEXT:    buffer_store_b16 v2, off, s[8:11], 0
-; GFX11-DENORM-NEXT:    s_nop 0
-; GFX11-DENORM-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DENORM-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
@@ -381,8 +377,6 @@ define amdgpu_kernel void @fmuladd_f16_imm_a(
 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FLUSH-NEXT:    v_add_f16_e32 v0, v0, v1
 ; GFX11-FLUSH-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
-; GFX11-FLUSH-NEXT:    s_nop 0
-; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FLUSH-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-LABEL: fmuladd_f16_imm_a:
@@ -407,8 +401,6 @@ define amdgpu_kernel void @fmuladd_f16_imm_a(
 ; GFX11-DENORM-NEXT:    s_mov_b32 s9, s5
 ; GFX11-DENORM-NEXT:    v_fmac_f16_e32 v1, 0x4200, v0
 ; GFX11-DENORM-NEXT:    buffer_store_b16 v1, off, s[8:11], 0
-; GFX11-DENORM-NEXT:    s_nop 0
-; GFX11-DENORM-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DENORM-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %b,
@@ -567,8 +559,6 @@ define amdgpu_kernel void @fmuladd_f16_imm_b(
 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FLUSH-NEXT:    v_add_f16_e32 v0, v0, v1
 ; GFX11-FLUSH-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
-; GFX11-FLUSH-NEXT:    s_nop 0
-; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FLUSH-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-LABEL: fmuladd_f16_imm_b:
@@ -593,8 +583,6 @@ define amdgpu_kernel void @fmuladd_f16_imm_b(
 ; GFX11-DENORM-NEXT:    s_mov_b32 s9, s5
 ; GFX11-DENORM-NEXT:    v_fmac_f16_e32 v1, 0x4200, v0
 ; GFX11-DENORM-NEXT:    buffer_store_b16 v1, off, s[8:11], 0
-; GFX11-DENORM-NEXT:    s_nop 0
-; GFX11-DENORM-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DENORM-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
@@ -807,8 +795,6 @@ define amdgpu_kernel void @fmuladd_v2f16(
 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FLUSH-NEXT:    v_pk_add_f16 v0, v0, v2
 ; GFX11-FLUSH-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
-; GFX11-FLUSH-NEXT:    s_nop 0
-; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FLUSH-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-LABEL: fmuladd_v2f16:
@@ -837,8 +823,6 @@ define amdgpu_kernel void @fmuladd_v2f16(
 ; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-DENORM-NEXT:    v_pk_fma_f16 v0, v0, v1, v2
 ; GFX11-DENORM-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
-; GFX11-DENORM-NEXT:    s_nop 0
-; GFX11-DENORM-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-DENORM-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.get.fpmode.ll b/llvm/test/CodeGen/AMDGPU/llvm.get.fpmode.ll
index 2bb4cc617e7f17..ccd30d3d8bea68 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.get.fpmode.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.get.fpmode.ll
@@ -147,8 +147,6 @@ define amdgpu_kernel void @kernel_fpmode_i32(ptr addrspace(1) %ptr) {
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %fpmode = call i32 @llvm.get.fpmode.i32()
   store i32 %fpmode, ptr addrspace(1) %ptr
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll
index f86c8294ab3c00..86311ab859258d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll
@@ -79,8 +79,6 @@ define amdgpu_kernel void @sgpr_isnan_bf16(ptr addrspace(1) %out, bfloat %x) {
 ; GFX11CHECK-NEXT:    s_cselect_b32 s2, -1, 0
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s2
 ; GFX11CHECK-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11CHECK-NEXT:    s_nop 0
-; GFX11CHECK-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11CHECK-NEXT:    s_endpgm
   %result = call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 3)
   %sext = sext i1 %result to i32
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
index 3d8e9e60973053..a577fb3d190ab9 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
@@ -86,8 +86,6 @@ define amdgpu_kernel void @sgpr_isnan_f16(ptr addrspace(1) %out, half %x) {
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s2, s4, 3
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s2
 ; GFX11CHECK-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11CHECK-NEXT:    s_nop 0
-; GFX11CHECK-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11CHECK-NEXT:    s_endpgm
   %result = call i1 @llvm.is.fpclass.f16(half %x, i32 3)
   %sext = sext i1 %result to i32
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll
index e05f3f1e65ff3a..96551d5bf78539 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll
@@ -81,8 +81,6 @@ define amdgpu_kernel void @sgpr_isnan_f32(ptr addrspace(1) %out, float %x) {
 ; GFX11CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s2
 ; GFX11CHECK-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11CHECK-NEXT:    s_nop 0
-; GFX11CHECK-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11CHECK-NEXT:    s_endpgm
   %result = call i1 @llvm.is.fpclass.f32(float %x, i32 3)  ; nan
   %sext = sext i1 %result to i32
@@ -166,8 +164,6 @@ define amdgpu_kernel void @sgpr_isnan_f64(ptr addrspace(1) %out, double %x) {
 ; GFX11CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s2
 ; GFX11CHECK-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11CHECK-NEXT:    s_nop 0
-; GFX11CHECK-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11CHECK-NEXT:    s_endpgm
   %result = call i1 @llvm.is.fpclass.f64(double %x, i32 3)  ; nan
   %sext = sext i1 %result to i32
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.ll
index 7f4cf19e9b85b4..8f28208945fbcd 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log.ll
@@ -208,8 +208,6 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) {
 ; GFX1100-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX1100-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX1100-SDAG-NEXT:    s_nop 0
-; GFX1100-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1100-GISEL-LABEL: s_log_f32:
@@ -236,8 +234,6 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) {
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v2
 ; GFX1100-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX1100-GISEL-NEXT:    s_nop 0
-; GFX1100-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-GISEL-NEXT:    s_endpgm
 ;
 ; R600-LABEL: s_log_f32:
@@ -595,8 +591,6 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_dual_sub_f32 v1, v0, v4 :: v_dual_sub_f32 v0, v2, v5
 ; GFX1100-SDAG-NEXT:    global_store_b64 v3, v[0:1], s[0:1]
-; GFX1100-SDAG-NEXT:    s_nop 0
-; GFX1100-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1100-GISEL-LABEL: s_log_v2f32:
@@ -630,8 +624,6 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_dual_sub_f32 v0, v0, v4 :: v_dual_sub_f32 v1, v1, v5
 ; GFX1100-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX1100-GISEL-NEXT:    s_nop 0
-; GFX1100-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-GISEL-NEXT:    s_endpgm
 ;
 ; R600-LABEL: s_log_v2f32:
@@ -1136,8 +1128,6 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1100-SDAG-NEXT:    v_sub_f32_e32 v0, v3, v6
 ; GFX1100-SDAG-NEXT:    global_store_b96 v4, v[0:2], s[0:1]
-; GFX1100-SDAG-NEXT:    s_nop 0
-; GFX1100-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1100-GISEL-LABEL: s_log_v3f32:
@@ -1192,8 +1182,6 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v6
 ; GFX1100-GISEL-NEXT:    global_store_b96 v3, v[0:2], s[0:1]
-; GFX1100-GISEL-NEXT:    s_nop 0
-; GFX1100-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-GISEL-NEXT:    s_endpgm
 ;
 ; R600-LABEL: s_log_v3f32:
@@ -1826,8 +1814,6 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1100-SDAG-NEXT:    v_dual_sub_f32 v1, v5, v14 :: v_dual_sub_f32 v0, v6, v15
 ; GFX1100-SDAG-NEXT:    global_store_b128 v7, v[0:3], s[0:1]
-; GFX1100-SDAG-NEXT:    s_nop 0
-; GFX1100-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1100-GISEL-LABEL: s_log_v4f32:
@@ -1888,8 +1874,6 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v3, v3, v15
 ; GFX1100-GISEL-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
-; GFX1100-GISEL-NEXT:    s_nop 0
-; GFX1100-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-GISEL-NEXT:    s_endpgm
 ;
 ; R600-LABEL: s_log_v4f32:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
index 1c64e6b76c9577..e80d8e3bfb386e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
@@ -208,8 +208,6 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) {
 ; GFX1100-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX1100-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX1100-SDAG-NEXT:    s_nop 0
-; GFX1100-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1100-GISEL-LABEL: s_log10_f32:
@@ -236,8 +234,6 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) {
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v2
 ; GFX1100-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX1100-GISEL-NEXT:    s_nop 0
-; GFX1100-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-GISEL-NEXT:    s_endpgm
 ;
 ; R600-LABEL: s_log10_f32:
@@ -595,8 +591,6 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_dual_sub_f32 v1, v0, v4 :: v_dual_sub_f32 v0, v2, v5
 ; GFX1100-SDAG-NEXT:    global_store_b64 v3, v[0:1], s[0:1]
-; GFX1100-SDAG-NEXT:    s_nop 0
-; GFX1100-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1100-GISEL-LABEL: s_log10_v2f32:
@@ -630,8 +624,6 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_dual_sub_f32 v0, v0, v4 :: v_dual_sub_f32 v1, v1, v5
 ; GFX1100-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX1100-GISEL-NEXT:    s_nop 0
-; GFX1100-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-GISEL-NEXT:    s_endpgm
 ;
 ; R600-LABEL: s_log10_v2f32:
@@ -1136,8 +1128,6 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1100-SDAG-NEXT:    v_sub_f32_e32 v0, v3, v6
 ; GFX1100-SDAG-NEXT:    global_store_b96 v4, v[0:2], s[0:1]
-; GFX1100-SDAG-NEXT:    s_nop 0
-; GFX1100-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1100-GISEL-LABEL: s_log10_v3f32:
@@ -1192,8 +1182,6 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v6
 ; GFX1100-GISEL-NEXT:    global_store_b96 v3, v[0:2], s[0:1]
-; GFX1100-GISEL-NEXT:    s_nop 0
-; GFX1100-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-GISEL-NEXT:    s_endpgm
 ;
 ; R600-LABEL: s_log10_v3f32:
@@ -1826,8 +1814,6 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1100-SDAG-NEXT:    v_dual_sub_f32 v1, v5, v14 :: v_dual_sub_f32 v0, v6, v15
 ; GFX1100-SDAG-NEXT:    global_store_b128 v7, v[0:3], s[0:1]
-; GFX1100-SDAG-NEXT:    s_nop 0
-; GFX1100-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1100-GISEL-LABEL: s_log10_v4f32:
@@ -1888,8 +1874,6 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v3, v3, v15
 ; GFX1100-GISEL-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
-; GFX1100-GISEL-NEXT:    s_nop 0
-; GFX1100-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-GISEL-NEXT:    s_endpgm
 ;
 ; R600-LABEL: s_log10_v4f32:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
index 50c52037dc4d31..6578311178ab75 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
@@ -142,8 +142,6 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) {
 ; GFX1100-SDAG-NEXT:    v_sub_f32_e32 v0, v1, v0
 ; GFX1100-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX1100-SDAG-NEXT:    s_nop 0
-; GFX1100-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1100-GISEL-LABEL: s_log2_f32:
@@ -162,8 +160,6 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) {
 ; GFX1100-GISEL-NEXT:    v_dual_sub_f32 v0, v0, v1 :: v_dual_mov_b32 v1, 0
 ; GFX1100-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX1100-GISEL-NEXT:    s_nop 0
-; GFX1100-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-GISEL-NEXT:    s_endpgm
 ;
 ; R600-LABEL: s_log2_f32:
@@ -379,8 +375,6 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-SDAG-NEXT:    v_dual_sub_f32 v1, v1, v0 :: v_dual_sub_f32 v0, v3, v2
 ; GFX1100-SDAG-NEXT:    global_store_b64 v4, v[0:1], s[0:1]
-; GFX1100-SDAG-NEXT:    s_nop 0
-; GFX1100-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1100-GISEL-LABEL: s_log2_v2f32:
@@ -403,8 +397,6 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ; GFX1100-GISEL-NEXT:    v_dual_sub_f32 v0, v0, v2 :: v_dual_sub_f32 v1, v1, v3
 ; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1100-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX1100-GISEL-NEXT:    s_nop 0
-; GFX1100-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-GISEL-NEXT:    s_endpgm
 ;
 ; R600-LABEL: s_log2_v2f32:
@@ -686,8 +678,6 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
 ; GFX1100-SDAG-NEXT:    v_dual_sub_f32 v0, v5, v3 :: v_dual_sub_f32 v1, v4, v1
 ; GFX1100-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    global_store_b96 v6, v[0:2], s[0:1]
-; GFX1100-SDAG-NEXT:    s_nop 0
-; GFX1100-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1100-GISEL-LABEL: s_log2_v3f32:
@@ -719,8 +709,6 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v5
 ; GFX1100-GISEL-NEXT:    global_store_b96 v3, v[0:2], s[0:1]
-; GFX1100-GISEL-NEXT:    s_nop 0
-; GFX1100-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-GISEL-NEXT:    s_endpgm
 ;
 ; R600-LABEL: s_log2_v3f32:
@@ -1065,8 +1053,6 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in)
 ; GFX1100-SDAG-NEXT:    v_dual_sub_f32 v1, v6, v4 :: v_dual_sub_f32 v0, v7, v5
 ; GFX1100-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    global_store_b128 v9, v[0:3], s[0:1]
-; GFX1100-SDAG-NEXT:    s_nop 0
-; GFX1100-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1100-GISEL-LABEL: s_log2_v4f32:
@@ -1103,8 +1089,6 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in)
 ; GFX1100-GISEL-NEXT:    v_dual_sub_f32 v2, v2, v6 :: v_dual_sub_f32 v3, v3, v7
 ; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1100-GISEL-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
-; GFX1100-GISEL-NEXT:    s_nop 0
-; GFX1100-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1100-GISEL-NEXT:    s_endpgm
 ;
 ; R600-LABEL: s_log2_v4f32:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
index 2c9ce001b8c4d9..1daa45285e68aa 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
@@ -138,8 +138,6 @@ define amdgpu_kernel void @maxnum_f16(
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_max_f16_e32 v0, v0, v1
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
@@ -248,8 +246,6 @@ define amdgpu_kernel void @maxnum_f16_imm_a(
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_max_f16_e32 v0, 0x4200, v0
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %b) #0 {
@@ -356,8 +352,6 @@ define amdgpu_kernel void @maxnum_f16_imm_b(
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_max_f16_e32 v0, 4.0, v0
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) #0 {
@@ -469,8 +463,6 @@ define amdgpu_kernel void @maxnum_v2f16(
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_pk_max_f16 v0, v1, v0
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
@@ -561,8 +553,6 @@ define amdgpu_kernel void @maxnum_v2f16_imm_a(
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_pk_max_f16 v0, 0x44004200, v0
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %b) #0 {
@@ -651,8 +641,6 @@ define amdgpu_kernel void @maxnum_v2f16_imm_b(
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_pk_max_f16 v0, 0x42004400, v0
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) #0 {
@@ -787,8 +775,6 @@ define amdgpu_kernel void @maxnum_v3f16(
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    buffer_store_b16 v1, off, s[0:3], 0 offset:4
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
@@ -934,8 +920,6 @@ define amdgpu_kernel void @maxnum_v4f16(
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_pk_max_f16 v0, v3, v2
 ; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
@@ -1053,8 +1037,6 @@ define amdgpu_kernel void @fmax_v4f16_imm_a(
 ; GFX11-NEXT:    v_pk_max_f16 v1, 0x44004200, v0
 ; GFX11-NEXT:    v_pk_max_f16 v0, 0x40004800, v2
 ; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %b) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
index 59508e049a3a91..00cba8c77fc78f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
@@ -137,8 +137,6 @@ define amdgpu_kernel void @minnum_f16_ieee(
 ; GFX11-NEXT:    v_max_f16_e32 v1, v1, v1
 ; GFX11-NEXT:    v_min_f16_e32 v0, v0, v1
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
@@ -274,8 +272,6 @@ define amdgpu_kernel void @minnum_f16_imm_a(
 ; GFX11-NEXT:    v_max_f16_e32 v0, v0, v0
 ; GFX11-NEXT:    v_min_f16_e32 v0, 0x4200, v0
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %b) #0 {
@@ -381,8 +377,6 @@ define amdgpu_kernel void @minnum_f16_imm_b(
 ; GFX11-NEXT:    v_max_f16_e32 v0, v0, v0
 ; GFX11-NEXT:    v_min_f16_e32 v0, 4.0, v0
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) #0 {
@@ -493,8 +487,6 @@ define amdgpu_kernel void @minnum_v2f16_ieee(
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    v_pk_min_f16 v0, v1, v0
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
@@ -619,8 +611,6 @@ define amdgpu_kernel void @minnum_v2f16_imm_a(
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    v_pk_min_f16 v0, 0x44004200, v0
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %b) #0 {
@@ -708,8 +698,6 @@ define amdgpu_kernel void @minnum_v2f16_imm_b(
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    v_pk_min_f16 v0, 0x42004400, v0
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) #0 {
@@ -843,8 +831,6 @@ define amdgpu_kernel void @minnum_v3f16(
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    buffer_store_b16 v1, off, s[0:3], 0 offset:4
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
@@ -989,8 +975,6 @@ define amdgpu_kernel void @minnum_v4f16(
 ; GFX11-NEXT:    v_pk_min_f16 v1, v1, v0
 ; GFX11-NEXT:    v_pk_min_f16 v0, v3, v2
 ; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
@@ -1107,8 +1091,6 @@ define amdgpu_kernel void @fmin_v4f16_imm_a(
 ; GFX11-NEXT:    v_pk_min_f16 v1, 0x44004200, v0
 ; GFX11-NEXT:    v_pk_min_f16 v0, 0x40004800, v2
 ; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %b) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
index 2d3b18dcb121cd..7e3158bd1106ea 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
@@ -448,8 +448,6 @@ define amdgpu_kernel void @umulo_i64_s(i64 %x, i64 %y) {
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    global_store_b64 v[0:1], v[0:1], off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: umulo_i64_s:
@@ -476,8 +474,6 @@ define amdgpu_kernel void @umulo_i64_s(i64 %x, i64 %y) {
 ; GFX12-NEXT:    s_cselect_b32 s1, 0, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    global_store_b64 v[0:1], v[0:1], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %umulo = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %x, i64 %y)
@@ -661,8 +657,6 @@ define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) {
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    global_store_b64 v[0:1], v[0:1], off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: smulo_i64_s:
@@ -703,8 +697,6 @@ define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) {
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    global_store_b64 v[0:1], v[0:1], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %umulo = tail call { i64, i1 } @llvm.smul.with.overflow.i64(i64 %x, i64 %y)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
index bf7dbcde62fdd0..4fc401ff20ac3b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
@@ -80,8 +80,6 @@ define amdgpu_kernel void @rint_f16(
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_rndne_f16_e32 v0, v0
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
@@ -187,8 +185,6 @@ define amdgpu_kernel void @rint_v2f16(
 ; GFX11-NEXT:    v_rndne_f16_e32 v1, v1
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.ll
index e7b17c30cf7535..5347f0f00c3157 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.round.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.round.ll
@@ -78,8 +78,6 @@ define amdgpu_kernel void @round_f32(ptr addrspace(1) %out, float %x) #0 {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; R600-LABEL: round_f32:
@@ -209,8 +207,6 @@ define amdgpu_kernel void @round_v2f32(ptr addrspace(1) %out, <2 x float> %in) #
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    v_add_f32_e32 v0, v2, v3
 ; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; R600-LABEL: round_v2f32:
@@ -393,8 +389,6 @@ define amdgpu_kernel void @round_v4f32(ptr addrspace(1) %out, <4 x float> %in) #
 ; GFX11-NEXT:    v_bfi_b32 v7, 0x7fffffff, v7, s4
 ; GFX11-NEXT:    v_dual_add_f32 v1, v4, v6 :: v_dual_add_f32 v0, v5, v7
 ; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; R600-LABEL: round_v4f32:
@@ -709,8 +703,6 @@ define amdgpu_kernel void @round_v8f32(ptr addrspace(1) %out, <8 x float> %in) #
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    buffer_store_b128 v[4:7], off, s[0:3], 0 offset:16
 ; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; R600-LABEL: round_v8f32:
@@ -853,8 +845,6 @@ define amdgpu_kernel void @round_f16(ptr addrspace(1) %out, i32 %x.arg) #0 {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_add_f16_e32 v0, v0, v1
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; R600-LABEL: round_f16:
@@ -1007,8 +997,6 @@ define amdgpu_kernel void @round_v2f16(ptr addrspace(1) %out, i32 %in.arg) #0 {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v1, v0
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; R600-LABEL: round_v2f16:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll b/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll
index 70f15bd0aa6131..d618b937910276 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll
@@ -1779,8 +1779,6 @@ define amdgpu_kernel void @get_rounding_after_set_rounding_1() {
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
 ; GFX11-NEXT:    global_store_b32 v[0:1], v2, off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   tail call void @llvm.set.rounding(i32 1)
   %set.mode = tail call i32 @llvm.get.rounding()
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
index 3ae0cf65eb00f2..a177a61823dc4d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
@@ -79,8 +79,6 @@ define amdgpu_kernel void @sin_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_sin_f16_e32 v1, v1
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %a.val = load half, ptr addrspace(1) %a
   %r.val = call half @llvm.sin.f16(half %a.val)
@@ -189,8 +187,6 @@ define amdgpu_kernel void @sin_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_pack_b32_f16 v1, v1, v2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %a.val = load <2 x half>, ptr addrspace(1) %a
   %r.val = call <2 x half> @llvm.sin.v2f16(<2 x half> %a.val)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll
index c69ebedbec50b5..21c4455565db80 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll
@@ -61,8 +61,6 @@ define amdgpu_kernel void @sqrt_f16(
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_sqrt_f16_e32 v0, v0
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
@@ -148,8 +146,6 @@ define amdgpu_kernel void @sqrt_v2f16(
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll
index 11f5e6ebf99980..623db04a0e90d7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll
@@ -61,8 +61,6 @@ define amdgpu_kernel void @trunc_f16(
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_trunc_f16_e32 v0, v0
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
@@ -148,8 +146,6 @@ define amdgpu_kernel void @trunc_v2f16(
 ; GFX11-NEXT:    v_trunc_f16_e32 v1, v1
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-always-uniform.ll b/llvm/test/CodeGen/AMDGPU/load-constant-always-uniform.ll
index 668ebe3f953b32..b88266981a2533 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-always-uniform.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-always-uniform.ll
@@ -22,8 +22,6 @@ define amdgpu_cs void @test_uniform_load_b96(ptr addrspace(1) %ptr, i32 %arg) "a
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_or3_b32 v2, s2, v2, s0
 ; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: test_uniform_load_b96:
@@ -43,8 +41,6 @@ define amdgpu_cs void @test_uniform_load_b96(ptr addrspace(1) %ptr, i32 %arg) "a
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_or3_b32 v2, v2, s1, s2
 ; GFX12-NEXT:    global_store_b32 v[0:1], v2, off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %i = zext i32 %arg to i64
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-f32.ll b/llvm/test/CodeGen/AMDGPU/load-constant-f32.ll
index 029c4e51e29934..1afd1786569b6e 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-f32.ll
@@ -76,8 +76,6 @@ define amdgpu_kernel void @constant_load_v8f32(ptr addrspace(4) noalias nocaptur
 ; GFX12-NEXT:    s_add_f32 s0, s7, s0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[10:11]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %out_ptr.promoted = load float, ptr addrspace(1) %out_ptr, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll b/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll
index 7202ab8b314669..e8115a3db557e7 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll
@@ -54,8 +54,6 @@ define amdgpu_kernel void @constant_load_f64(ptr addrspace(1) %out, ptr addrspac
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %ld = load double, ptr addrspace(4) %in
   store double %ld, ptr addrspace(1) %out
@@ -155,8 +153,6 @@ define amdgpu_kernel void @constant_load_2v4f64(ptr addrspace(4) noalias nocaptu
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_add_f64_e32 v[0:1], s[14:15], v[0:1]
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[18:19]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %out_ptr.promoted = load double, ptr addrspace(1) %out_ptr, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
index 32abe50ff04d81..d8b4fadeebba73 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
@@ -73,8 +73,6 @@ define amdgpu_kernel void @constant_load_i1(ptr addrspace(1) %out, ptr addrspace
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX12-NEXT:    global_store_b8 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load i1, ptr addrspace(4) %in
   store i1 %load, ptr addrspace(1) %out
@@ -146,8 +144,6 @@ define amdgpu_kernel void @constant_load_v2i1(ptr addrspace(1) %out, ptr addrspa
 ; GFX12-NEXT:    global_load_u8 v1, v0, s[2:3]
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b8 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <2 x i1>, ptr addrspace(4) %in
   store <2 x i1> %load, ptr addrspace(1) %out
@@ -218,8 +214,6 @@ define amdgpu_kernel void @constant_load_v3i1(ptr addrspace(1) %out, ptr addrspa
 ; GFX12-NEXT:    global_load_u8 v1, v0, s[2:3]
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b8 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <3 x i1>, ptr addrspace(4) %in
   store <3 x i1> %load, ptr addrspace(1) %out
@@ -291,8 +285,6 @@ define amdgpu_kernel void @constant_load_v4i1(ptr addrspace(1) %out, ptr addrspa
 ; GFX12-NEXT:    global_load_u8 v1, v0, s[2:3]
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b8 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <4 x i1>, ptr addrspace(4) %in
   store <4 x i1> %load, ptr addrspace(1) %out
@@ -364,8 +356,6 @@ define amdgpu_kernel void @constant_load_v8i1(ptr addrspace(1) %out, ptr addrspa
 ; GFX12-NEXT:    global_load_u8 v1, v0, s[2:3]
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b8 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <8 x i1>, ptr addrspace(4) %in
   store <8 x i1> %load, ptr addrspace(1) %out
@@ -437,8 +427,6 @@ define amdgpu_kernel void @constant_load_v16i1(ptr addrspace(1) %out, ptr addrsp
 ; GFX12-NEXT:    global_load_u16 v1, v0, s[2:3]
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <16 x i1>, ptr addrspace(4) %in
   store <16 x i1> %load, ptr addrspace(1) %out
@@ -494,8 +482,6 @@ define amdgpu_kernel void @constant_load_v32i1(ptr addrspace(1) %out, ptr addrsp
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <32 x i1>, ptr addrspace(4) %in
   store <32 x i1> %load, ptr addrspace(1) %out
@@ -554,8 +540,6 @@ define amdgpu_kernel void @constant_load_v64i1(ptr addrspace(1) %out, ptr addrsp
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <64 x i1>, ptr addrspace(4) %in
   store <64 x i1> %load, ptr addrspace(1) %out
@@ -617,8 +601,6 @@ define amdgpu_kernel void @constant_zextload_i1_to_i32(ptr addrspace(1) %out, pt
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %a = load i1, ptr addrspace(4) %in
   %ext = zext i1 %a to i32
@@ -686,8 +668,6 @@ define amdgpu_kernel void @constant_sextload_i1_to_i32(ptr addrspace(1) %out, pt
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %a = load i1, ptr addrspace(4) %in
   %ext = sext i1 %a to i32
@@ -750,8 +730,6 @@ define amdgpu_kernel void @constant_zextload_v1i1_to_v1i32(ptr addrspace(1) %out
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <1 x i1>, ptr addrspace(4) %in
   %ext = zext <1 x i1> %load to <1 x i32>
@@ -819,8 +797,6 @@ define amdgpu_kernel void @constant_sextload_v1i1_to_v1i32(ptr addrspace(1) %out
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <1 x i1>, ptr addrspace(4) %in
   %ext = sext <1 x i1> %load to <1 x i32>
@@ -894,8 +870,6 @@ define amdgpu_kernel void @constant_zextload_v2i1_to_v2i32(ptr addrspace(1) %out
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <2 x i1>, ptr addrspace(4) %in
   %ext = zext <2 x i1> %load to <2 x i32>
@@ -969,8 +943,6 @@ define amdgpu_kernel void @constant_sextload_v2i1_to_v2i32(ptr addrspace(1) %out
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s3
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <2 x i1>, ptr addrspace(4) %in
   %ext = sext <2 x i1> %load to <2 x i32>
@@ -1054,8 +1026,6 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i32(ptr addrspace(1) %out
 ; GFX12-NEXT:    v_bfe_u32 v1, v1, 1, 1
 ; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX12-NEXT:    global_store_b96 v3, v[0:2], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <3 x i1>, ptr addrspace(4) %in
   %ext = zext <3 x i1> %load to <3 x i32>
@@ -1138,8 +1108,6 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i32(ptr addrspace(1) %out
 ; GFX12-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s4
 ; GFX12-NEXT:    v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s3
 ; GFX12-NEXT:    global_store_b96 v3, v[0:2], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <3 x i1>, ptr addrspace(4) %in
   %ext = sext <3 x i1> %load to <3 x i32>
@@ -1223,8 +1191,6 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i32(ptr addrspace(1) %out
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v3, 3, v3
 ; GFX12-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <4 x i1>, ptr addrspace(4) %in
   %ext = zext <4 x i1> %load to <4 x i32>
@@ -1311,8 +1277,6 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i32(ptr addrspace(1) %out
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX12-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <4 x i1>, ptr addrspace(4) %in
   %ext = sext <4 x i1> %load to <4 x i32>
@@ -1438,8 +1402,6 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i32(ptr addrspace(1) %out
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v8, v[0:3], s[0:1] offset:16
 ; GFX12-NEXT:    global_store_b128 v8, v[4:7], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <8 x i1>, ptr addrspace(4) %in
   %ext = zext <8 x i1> %load to <8 x i32>
@@ -1561,8 +1523,6 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i32(ptr addrspace(1) %out
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v8, v[0:3], s[0:1] offset:16
 ; GFX12-NEXT:    global_store_b128 v8, v[4:7], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <8 x i1>, ptr addrspace(4) %in
   %ext = sext <8 x i1> %load to <8 x i32>
@@ -1762,8 +1722,6 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i32(ptr addrspace(1) %o
 ; GFX12-NEXT:    global_store_b128 v16, v[4:7], s[0:1] offset:32
 ; GFX12-NEXT:    global_store_b128 v16, v[8:11], s[0:1] offset:16
 ; GFX12-NEXT:    global_store_b128 v16, v[12:15], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <16 x i1>, ptr addrspace(4) %in
   %ext = zext <16 x i1> %load to <16 x i32>
@@ -1957,8 +1915,6 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i32(ptr addrspace(1) %o
 ; GFX12-NEXT:    global_store_b128 v16, v[4:7], s[0:1] offset:32
 ; GFX12-NEXT:    global_store_b128 v16, v[8:11], s[0:1] offset:16
 ; GFX12-NEXT:    global_store_b128 v16, v[12:15], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <16 x i1>, ptr addrspace(4) %in
   %ext = sext <16 x i1> %load to <16 x i32>
@@ -2328,8 +2284,6 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i32(ptr addrspace(1) %o
 ; GFX12-NEXT:    global_store_b128 v24, v[12:15], s[0:1] offset:32
 ; GFX12-NEXT:    global_store_b128 v24, v[16:19], s[0:1] offset:16
 ; GFX12-NEXT:    global_store_b128 v24, v[20:23], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <32 x i1>, ptr addrspace(4) %in
   %ext = zext <32 x i1> %load to <32 x i32>
@@ -2732,8 +2686,6 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i32(ptr addrspace(1) %o
 ; GFX12-NEXT:    global_store_b128 v24, v[12:15], s[0:1] offset:32
 ; GFX12-NEXT:    global_store_b128 v24, v[16:19], s[0:1] offset:16
 ; GFX12-NEXT:    global_store_b128 v24, v[20:23], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <32 x i1>, ptr addrspace(4) %in
   %ext = sext <32 x i1> %load to <32 x i32>
@@ -3436,8 +3388,6 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o
 ; GFX12-NEXT:    global_store_b128 v24, v[12:15], s[0:1] offset:32
 ; GFX12-NEXT:    global_store_b128 v24, v[16:19], s[0:1] offset:16
 ; GFX12-NEXT:    global_store_b128 v24, v[20:23], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <64 x i1>, ptr addrspace(4) %in
   %ext = zext <64 x i1> %load to <64 x i32>
@@ -4200,8 +4150,6 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o
 ; GFX12-NEXT:    global_store_b128 v24, v[12:15], s[0:1] offset:32
 ; GFX12-NEXT:    global_store_b128 v24, v[16:19], s[0:1] offset:16
 ; GFX12-NEXT:    global_store_b128 v24, v[20:23], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <64 x i1>, ptr addrspace(4) %in
   %ext = sext <64 x i1> %load to <64 x i32>
@@ -4271,8 +4219,6 @@ define amdgpu_kernel void @constant_zextload_i1_to_i64(ptr addrspace(1) %out, pt
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
 ; GFX12-NEXT:    global_store_b64 v1, v[0:1], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %a = load i1, ptr addrspace(4) %in
   %ext = zext i1 %a to i64
@@ -4343,8 +4289,6 @@ define amdgpu_kernel void @constant_sextload_i1_to_i64(ptr addrspace(1) %out, pt
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %a = load i1, ptr addrspace(4) %in
   %ext = sext i1 %a to i64
@@ -4414,8 +4358,6 @@ define amdgpu_kernel void @constant_zextload_v1i1_to_v1i64(ptr addrspace(1) %out
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
 ; GFX12-NEXT:    global_store_b64 v1, v[0:1], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <1 x i1>, ptr addrspace(4) %in
   %ext = zext <1 x i1> %load to <1 x i64>
@@ -4486,8 +4428,6 @@ define amdgpu_kernel void @constant_sextload_v1i1_to_v1i64(ptr addrspace(1) %out
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <1 x i1>, ptr addrspace(4) %in
   %ext = sext <1 x i1> %load to <1 x i64>
@@ -4570,8 +4510,6 @@ define amdgpu_kernel void @constant_zextload_v2i1_to_v2i64(ptr addrspace(1) %out
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <2 x i1>, ptr addrspace(4) %in
   %ext = zext <2 x i1> %load to <2 x i64>
@@ -4656,8 +4594,6 @@ define amdgpu_kernel void @constant_sextload_v2i1_to_v2i64(ptr addrspace(1) %out
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
 ; GFX12-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <2 x i1>, ptr addrspace(4) %in
   %ext = sext <2 x i1> %load to <2 x i64>
@@ -4761,8 +4697,6 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i64(ptr addrspace(1) %out
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b64 v5, v[4:5], s[0:1] offset:16
 ; GFX12-NEXT:    global_store_b128 v5, v[0:3], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <3 x i1>, ptr addrspace(4) %in
   %ext = zext <3 x i1> %load to <3 x i64>
@@ -4873,8 +4807,6 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i64(ptr addrspace(1) %out
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b64 v6, v[4:5], s[0:1] offset:16
 ; GFX12-NEXT:    global_store_b128 v6, v[0:3], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <3 x i1>, ptr addrspace(4) %in
   %ext = sext <3 x i1> %load to <3 x i64>
@@ -4991,8 +4923,6 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i64(ptr addrspace(1) %out
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-NEXT:    v_mov_b32_e32 v2, s3
 ; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <4 x i1>, ptr addrspace(4) %in
   %ext = zext <4 x i1> %load to <4 x i64>
@@ -5116,8 +5046,6 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i64(ptr addrspace(1) %out
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v8, v[4:7], s[0:1] offset:16
 ; GFX12-NEXT:    global_store_b128 v8, v[0:3], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <4 x i1>, ptr addrspace(4) %in
   %ext = sext <4 x i1> %load to <4 x i64>
@@ -5282,8 +5210,6 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(ptr addrspace(1) %out
 ; GFX12-NEXT:    global_store_b128 v1, v[4:7], s[0:1] offset:32
 ; GFX12-NEXT:    global_store_b128 v1, v[8:11], s[0:1] offset:16
 ; GFX12-NEXT:    global_store_b128 v1, v[12:15], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <8 x i1>, ptr addrspace(4) %in
   %ext = zext <8 x i1> %load to <8 x i64>
@@ -5489,8 +5415,6 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i64(ptr addrspace(1) %out
 ; GFX12-NEXT:    global_store_b128 v16, v[4:7], s[0:1] offset:32
 ; GFX12-NEXT:    global_store_b128 v16, v[8:11], s[0:1] offset:16
 ; GFX12-NEXT:    global_store_b128 v16, v[12:15], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <8 x i1>, ptr addrspace(4) %in
   %ext = sext <8 x i1> %load to <8 x i64>
@@ -5781,8 +5705,6 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o
 ; GFX12-NEXT:    global_store_b128 v1, v[8:11], s[0:1] offset:112
 ; GFX12-NEXT:    global_store_b128 v1, v[4:7], s[0:1] offset:64
 ; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <16 x i1>, ptr addrspace(4) %in
   %ext = zext <16 x i1> %load to <16 x i64>
@@ -6144,8 +6066,6 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(ptr addrspace(1) %o
 ; GFX12-NEXT:    global_store_b128 v32, v[20:23], s[0:1] offset:32
 ; GFX12-NEXT:    global_store_b128 v32, v[24:27], s[0:1] offset:16
 ; GFX12-NEXT:    global_store_b128 v32, v[28:31], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <16 x i1>, ptr addrspace(4) %in
   %ext = sext <16 x i1> %load to <16 x i64>
@@ -6675,8 +6595,6 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-NEXT:    v_mov_b32_e32 v2, s3
 ; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <32 x i1>, ptr addrspace(4) %in
   %ext = zext <32 x i1> %load to <32 x i64>
@@ -7385,8 +7303,6 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o
 ; GFX12-NEXT:    global_store_b128 v24, v[12:15], s[0:1] offset:32
 ; GFX12-NEXT:    global_store_b128 v24, v[16:19], s[0:1] offset:16
 ; GFX12-NEXT:    global_store_b128 v24, v[20:23], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <32 x i1>, ptr addrspace(4) %in
   %ext = sext <32 x i1> %load to <32 x i64>
@@ -8415,8 +8331,6 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-NEXT:    v_mov_b32_e32 v2, s3
 ; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <64 x i1>, ptr addrspace(4) %in
   %ext = zext <64 x i1> %load to <64 x i64>
@@ -9805,8 +9719,6 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
 ; GFX12-NEXT:    global_store_b128 v0, v[13:16], s[0:1] offset:32
 ; GFX12-NEXT:    global_store_b128 v0, v[17:20], s[0:1] offset:16
 ; GFX12-NEXT:    global_store_b128 v0, v[21:24], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <64 x i1>, ptr addrspace(4) %in
   %ext = sext <64 x i1> %load to <64 x i64>
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
index 2ee1c60b4bbf2d..7b2ccb60d142b1 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
@@ -83,8 +83,6 @@ define amdgpu_kernel void @constant_load_i16(ptr addrspace(1) %out, ptr addrspac
 ; GFX12-NEXT:    global_load_u16 v1, v0, s[2:3]
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ld = load i16, ptr addrspace(4) %in
@@ -153,8 +151,6 @@ define amdgpu_kernel void @constant_load_v2i16(ptr addrspace(1) %out, ptr addrsp
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ld = load <2 x i16>, ptr addrspace(4) %in
@@ -261,8 +257,6 @@ define amdgpu_kernel void @constant_load_v3i16(ptr addrspace(1) %out, ptr addrsp
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b16 v0, v1, s[0:1] offset:4
 ; GFX12-NEXT:    global_store_b32 v0, v2, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ld = load <3 x i16>, ptr addrspace(4) %in
@@ -335,8 +329,6 @@ define amdgpu_kernel void @constant_load_v4i16(ptr addrspace(1) %out, ptr addrsp
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ld = load <4 x i16>, ptr addrspace(4) %in
@@ -416,8 +408,6 @@ define amdgpu_kernel void @constant_load_v8i16(ptr addrspace(1) %out, ptr addrsp
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6
 ; GFX12-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ld = load <8 x i16>, ptr addrspace(4) %in
@@ -534,8 +524,6 @@ define amdgpu_kernel void @constant_load_v16i16(ptr addrspace(1) %out, ptr addrs
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v8, v[0:3], s[8:9] offset:16
 ; GFX12-NEXT:    global_store_b128 v8, v[4:7], s[8:9]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ld = load <16 x i16>, ptr addrspace(4) %in
@@ -774,8 +762,6 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) #
 ; GFX12-NEXT:    global_store_b128 v[0:1], v[0:3], off
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b128 v[0:1], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ld =  load <16 x i16>, ptr addrspace(4) %ptr0, align 2
@@ -851,8 +837,6 @@ define amdgpu_kernel void @constant_zextload_i16_to_i32(ptr addrspace(1) %out, p
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %a = load i16, ptr addrspace(4) %in
   %ext = zext i16 %a to i32
@@ -929,8 +913,6 @@ define amdgpu_kernel void @constant_sextload_i16_to_i32(ptr addrspace(1) %out, p
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %a = load i16, ptr addrspace(4) %in
   %ext = sext i16 %a to i32
@@ -1006,8 +988,6 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i32(ptr addrspace(1) %ou
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <1 x i16>, ptr addrspace(4) %in
   %ext = zext <1 x i16> %load to <1 x i32>
@@ -1084,8 +1064,6 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i32(ptr addrspace(1) %ou
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <1 x i16>, ptr addrspace(4) %in
   %ext = sext <1 x i16> %load to <1 x i32>
@@ -1170,8 +1148,6 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i32(ptr addrspace(1) %ou
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s3
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <2 x i16>, ptr addrspace(4) %in
   %ext = zext <2 x i16> %load to <2 x i32>
@@ -1258,8 +1234,6 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i32(ptr addrspace(1) %ou
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s3
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <2 x i16>, ptr addrspace(4) %in
   %ext = sext <2 x i16> %load to <2 x i32>
@@ -1356,8 +1330,6 @@ define amdgpu_kernel void @constant_zextload_v3i16_to_v3i32(ptr addrspace(1) %ou
 ; GFX12-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s4
 ; GFX12-NEXT:    v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s3
 ; GFX12-NEXT:    global_store_b96 v3, v[0:2], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ld = load <3 x i16>, ptr addrspace(4) %in
@@ -1458,8 +1430,6 @@ define amdgpu_kernel void @constant_sextload_v3i16_to_v3i32(ptr addrspace(1) %ou
 ; GFX12-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s2
 ; GFX12-NEXT:    v_dual_mov_b32 v1, s4 :: v_dual_mov_b32 v2, s3
 ; GFX12-NEXT:    global_store_b96 v3, v[0:2], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ld = load <3 x i16>, ptr addrspace(4) %in
@@ -1566,8 +1536,6 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(ptr addrspace(1) %ou
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v3, s4
 ; GFX12-NEXT:    v_mov_b32_e32 v2, s3
 ; GFX12-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <4 x i16>, ptr addrspace(4) %in
   %ext = zext <4 x i16> %load to <4 x i32>
@@ -1675,8 +1643,6 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i32(ptr addrspace(1) %ou
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s4
 ; GFX12-NEXT:    v_mov_b32_e32 v2, s3
 ; GFX12-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <4 x i16>, ptr addrspace(4) %in
   %ext = sext <4 x i16> %load to <4 x i32>
@@ -1835,8 +1801,6 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(ptr addrspace(1) %ou
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v8, v[0:3], s[0:1] offset:16
 ; GFX12-NEXT:    global_store_b128 v8, v[4:7], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <8 x i16>, ptr addrspace(4) %in
   %ext = zext <8 x i16> %load to <8 x i32>
@@ -1997,8 +1961,6 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i32(ptr addrspace(1) %ou
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v8, v[0:3], s[0:1] offset:16
 ; GFX12-NEXT:    global_store_b128 v8, v[4:7], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <8 x i16>, ptr addrspace(4) %in
   %ext = sext <8 x i16> %load to <8 x i32>
@@ -2261,8 +2223,6 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(ptr addrspace(1) %
 ; GFX12-NEXT:    global_store_b128 v16, v[4:7], s[0:1] offset:32
 ; GFX12-NEXT:    global_store_b128 v16, v[8:11], s[0:1] offset:16
 ; GFX12-NEXT:    global_store_b128 v16, v[12:15], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <16 x i16>, ptr addrspace(4) %in
   %ext = zext <16 x i16> %load to <16 x i32>
@@ -2529,8 +2489,6 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(ptr addrspace(1) %
 ; GFX12-NEXT:    global_store_b128 v16, v[4:7], s[0:1] offset:32
 ; GFX12-NEXT:    global_store_b128 v16, v[8:11], s[0:1] offset:16
 ; GFX12-NEXT:    global_store_b128 v16, v[12:15], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <16 x i16>, ptr addrspace(4) %in
   %ext = sext <16 x i16> %load to <16 x i32>
@@ -3010,8 +2968,6 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(ptr addrspace(1) %
 ; GFX12-NEXT:    global_store_b128 v24, v[12:15], s[16:17] offset:32
 ; GFX12-NEXT:    global_store_b128 v24, v[16:19], s[16:17] offset:16
 ; GFX12-NEXT:    global_store_b128 v24, v[20:23], s[16:17]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <32 x i16>, ptr addrspace(4) %in
   %ext = zext <32 x i16> %load to <32 x i32>
@@ -3501,8 +3457,6 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) %
 ; GFX12-NEXT:    global_store_b128 v24, v[12:15], s[16:17] offset:32
 ; GFX12-NEXT:    global_store_b128 v24, v[16:19], s[16:17] offset:16
 ; GFX12-NEXT:    global_store_b128 v24, v[20:23], s[16:17]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <32 x i16>, ptr addrspace(4) %in
   %ext = sext <32 x i16> %load to <32 x i32>
@@ -4424,8 +4378,6 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) %
 ; GFX12-NEXT:    global_store_b128 v24, v[12:15], s[36:37] offset:32
 ; GFX12-NEXT:    global_store_b128 v24, v[16:19], s[36:37] offset:16
 ; GFX12-NEXT:    global_store_b128 v24, v[20:23], s[36:37]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <64 x i16>, ptr addrspace(4) %in
   %ext = zext <64 x i16> %load to <64 x i32>
@@ -5363,8 +5315,6 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) %
 ; GFX12-NEXT:    global_store_b128 v24, v[12:15], s[36:37] offset:32
 ; GFX12-NEXT:    global_store_b128 v24, v[16:19], s[36:37] offset:16
 ; GFX12-NEXT:    global_store_b128 v24, v[20:23], s[36:37]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <64 x i16>, ptr addrspace(4) %in
   %ext = sext <64 x i16> %load to <64 x i32>
@@ -5446,8 +5396,6 @@ define amdgpu_kernel void @constant_zextload_i16_to_i64(ptr addrspace(1) %out, p
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX12-NEXT:    global_store_b64 v1, v[0:1], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %a = load i16, ptr addrspace(4) %in
   %ext = zext i16 %a to i64
@@ -5538,8 +5486,6 @@ define amdgpu_kernel void @constant_sextload_i16_to_i64(ptr addrspace(1) %out, p
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %a = load i16, ptr addrspace(4) %in
   %ext = sext i16 %a to i64
@@ -5621,8 +5567,6 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i64(ptr addrspace(1) %ou
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX12-NEXT:    global_store_b64 v1, v[0:1], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <1 x i16>, ptr addrspace(4) %in
   %ext = zext <1 x i16> %load to <1 x i64>
@@ -5708,8 +5652,6 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i64(ptr addrspace(1) %ou
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <1 x i16>, ptr addrspace(4) %in
   %ext = sext <1 x i16> %load to <1 x i64>
@@ -5803,8 +5745,6 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i64(ptr addrspace(1) %ou
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, v1
 ; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <2 x i16>, ptr addrspace(4) %in
   %ext = zext <2 x i16> %load to <2 x i64>
@@ -5903,8 +5843,6 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i64(ptr addrspace(1) %ou
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s5
 ; GFX12-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX12-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <2 x i16>, ptr addrspace(4) %in
   %ext = sext <2 x i16> %load to <2 x i64>
@@ -6036,8 +5974,6 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(ptr addrspace(1) %ou
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s3
 ; GFX12-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:16
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <4 x i16>, ptr addrspace(4) %in
   %ext = zext <4 x i16> %load to <4 x i64>
@@ -6186,8 +6122,6 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(ptr addrspace(1) %ou
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v8, v[4:7], s[0:1] offset:16
 ; GFX12-NEXT:    global_store_b128 v8, v[0:3], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <4 x i16>, ptr addrspace(4) %in
   %ext = sext <4 x i16> %load to <4 x i64>
@@ -6397,8 +6331,6 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(ptr addrspace(1) %ou
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s3
 ; GFX12-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <8 x i16>, ptr addrspace(4) %in
   %ext = zext <8 x i16> %load to <8 x i64>
@@ -6645,8 +6577,6 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou
 ; GFX12-NEXT:    global_store_b128 v16, v[0:3], s[0:1] offset:32
 ; GFX12-NEXT:    global_store_b128 v16, v[12:15], s[0:1] offset:16
 ; GFX12-NEXT:    global_store_b128 v16, v[4:7], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <8 x i16>, ptr addrspace(4) %in
   %ext = sext <8 x i16> %load to <8 x i64>
@@ -7013,8 +6943,6 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(ptr addrspace(1) %
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[8:9]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <16 x i16>, ptr addrspace(4) %in
   %ext = zext <16 x i16> %load to <16 x i64>
@@ -7462,8 +7390,6 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) %
 ; GFX12-NEXT:    global_store_b128 v24, v[0:3], s[8:9] offset:32
 ; GFX12-NEXT:    global_store_b128 v24, v[8:11], s[8:9] offset:16
 ; GFX12-NEXT:    global_store_b128 v24, v[20:23], s[8:9]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <16 x i16>, ptr addrspace(4) %in
   %ext = sext <16 x i16> %load to <16 x i64>
@@ -8149,8 +8075,6 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) %
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[16:17]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <32 x i16>, ptr addrspace(4) %in
   %ext = zext <32 x i16> %load to <32 x i64>
@@ -9005,8 +8929,6 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) %
 ; GFX12-NEXT:    global_store_b128 v24, v[12:15], s[16:17] offset:32
 ; GFX12-NEXT:    global_store_b128 v24, v[16:19], s[16:17] offset:16
 ; GFX12-NEXT:    global_store_b128 v24, v[20:23], s[16:17]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <32 x i16>, ptr addrspace(4) %in
   %ext = sext <32 x i16> %load to <32 x i64>
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
index 4ab55164e09992..72d19a1475682a 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
@@ -79,8 +79,6 @@ define amdgpu_kernel void @constant_load_i32(ptr addrspace(1) %out, ptr addrspac
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ld = load i32, ptr addrspace(4) %in
@@ -165,8 +163,6 @@ define amdgpu_kernel void @constant_load_v2i32(ptr addrspace(1) %out, ptr addrsp
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ld = load <2 x i32>, ptr addrspace(4) %in
@@ -262,8 +258,6 @@ define amdgpu_kernel void @constant_load_v3i32(ptr addrspace(1) %out, ptr addrsp
 ; GFX12-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s4
 ; GFX12-NEXT:    v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6
 ; GFX12-NEXT:    global_store_b96 v3, v[0:2], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ld = load <3 x i32>, ptr addrspace(4) %in
@@ -357,8 +351,6 @@ define amdgpu_kernel void @constant_load_v4i32(ptr addrspace(1) %out, ptr addrsp
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6
 ; GFX12-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ld = load <4 x i32>, ptr addrspace(4) %in
@@ -489,8 +481,6 @@ define amdgpu_kernel void @constant_load_v8i32(ptr addrspace(1) %out, ptr addrsp
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v8, v[0:3], s[8:9] offset:16
 ; GFX12-NEXT:    global_store_b128 v8, v[4:7], s[8:9]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ld = load <8 x i32>, ptr addrspace(4) %in
@@ -651,8 +641,6 @@ define amdgpu_kernel void @constant_load_v9i32(ptr addrspace(1) %out, ptr addrsp
 ; GFX12-NEXT:    global_store_b32 v8, v9, s[8:9] offset:32
 ; GFX12-NEXT:    global_store_b128 v8, v[0:3], s[8:9] offset:16
 ; GFX12-NEXT:    global_store_b128 v8, v[4:7], s[8:9]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ld = load <9 x i32>, ptr addrspace(4) %in
@@ -819,8 +807,6 @@ define amdgpu_kernel void @constant_load_v10i32(ptr addrspace(1) %out, ptr addrs
 ; GFX12-NEXT:    global_store_b64 v10, v[8:9], s[8:9] offset:32
 ; GFX12-NEXT:    global_store_b128 v10, v[0:3], s[8:9] offset:16
 ; GFX12-NEXT:    global_store_b128 v10, v[4:7], s[8:9]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ld = load <10 x i32>, ptr addrspace(4) %in
@@ -998,8 +984,6 @@ define amdgpu_kernel void @constant_load_v11i32(ptr addrspace(1) %out, ptr addrs
 ; GFX12-NEXT:    global_store_b96 v11, v[8:10], s[8:9] offset:32
 ; GFX12-NEXT:    global_store_b128 v11, v[0:3], s[8:9] offset:16
 ; GFX12-NEXT:    global_store_b128 v11, v[4:7], s[8:9]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ld = load <11 x i32>, ptr addrspace(4) %in
@@ -1175,8 +1159,6 @@ define amdgpu_kernel void @constant_load_v12i32(ptr addrspace(1) %out, ptr addrs
 ; GFX12-NEXT:    global_store_b128 v12, v[0:3], s[8:9] offset:32
 ; GFX12-NEXT:    global_store_b128 v12, v[4:7], s[8:9] offset:16
 ; GFX12-NEXT:    global_store_b128 v12, v[8:11], s[8:9]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ld = load <12 x i32>, ptr addrspace(4) %in
@@ -1383,8 +1365,6 @@ define amdgpu_kernel void @constant_load_v16i32(ptr addrspace(1) %out, ptr addrs
 ; GFX12-NEXT:    global_store_b128 v16, v[4:7], s[16:17] offset:32
 ; GFX12-NEXT:    global_store_b128 v16, v[8:11], s[16:17] offset:16
 ; GFX12-NEXT:    global_store_b128 v16, v[12:15], s[16:17]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ld = load <16 x i32>, ptr addrspace(4) %in
@@ -1468,8 +1448,6 @@ define amdgpu_kernel void @constant_zextload_i32_to_i64(ptr addrspace(1) %out, p
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
 ; GFX12-NEXT:    global_store_b64 v1, v[0:1], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %ld = load i32, ptr addrspace(4) %in
   %ext = zext i32 %ld to i64
@@ -1561,8 +1539,6 @@ define amdgpu_kernel void @constant_sextload_i32_to_i64(ptr addrspace(1) %out, p
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %ld = load i32, ptr addrspace(4) %in
   %ext = sext i32 %ld to i64
@@ -1646,8 +1622,6 @@ define amdgpu_kernel void @constant_zextload_v1i32_to_v1i64(ptr addrspace(1) %ou
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
 ; GFX12-NEXT:    global_store_b64 v1, v[0:1], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %ld = load <1 x i32>, ptr addrspace(4) %in
   %ext = zext <1 x i32> %ld to <1 x i64>
@@ -1739,8 +1713,6 @@ define amdgpu_kernel void @constant_sextload_v1i32_to_v1i64(ptr addrspace(1) %ou
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %ld = load <1 x i32>, ptr addrspace(4) %in
   %ext = sext <1 x i32> %ld to <1 x i64>
@@ -1837,8 +1809,6 @@ define amdgpu_kernel void @constant_zextload_v2i32_to_v2i64(ptr addrspace(1) %ou
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v3, v1
 ; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %ld = load <2 x i32>, ptr addrspace(4) %in
   %ext = zext <2 x i32> %ld to <2 x i64>
@@ -1951,8 +1921,6 @@ define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(ptr addrspace(1) %ou
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s4
 ; GFX12-NEXT:    v_mov_b32_e32 v2, s3
 ; GFX12-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %ld = load <2 x i32>, ptr addrspace(4) %in
   %ext = sext <2 x i32> %ld to <2 x i64>
@@ -2081,8 +2049,6 @@ define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(ptr addrspace(1) %ou
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %ld = load <4 x i32>, ptr addrspace(4) %in
   %ext = zext <4 x i32> %ld to <4 x i64>
@@ -2242,8 +2208,6 @@ define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(ptr addrspace(1) %ou
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v8, v[0:3], s[0:1] offset:16
 ; GFX12-NEXT:    global_store_b128 v8, v[4:7], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %ld = load <4 x i32>, ptr addrspace(4) %in
   %ext = sext <4 x i32> %ld to <4 x i64>
@@ -2441,8 +2405,6 @@ define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(ptr addrspace(1) %ou
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[8:9]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %ld = load <8 x i32>, ptr addrspace(4) %in
   %ext = zext <8 x i32> %ld to <8 x i64>
@@ -2709,8 +2671,6 @@ define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(ptr addrspace(1) %ou
 ; GFX12-NEXT:    global_store_b128 v16, v[4:7], s[0:1] offset:32
 ; GFX12-NEXT:    global_store_b128 v16, v[8:11], s[0:1] offset:16
 ; GFX12-NEXT:    global_store_b128 v16, v[12:15], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %ld = load <8 x i32>, ptr addrspace(4) %in
   %ext = sext <8 x i32> %ld to <8 x i64>
@@ -3189,8 +3149,6 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) %
 ; GFX12-NEXT:    global_store_b128 v28, v[20:23], s[16:17] offset:32
 ; GFX12-NEXT:    global_store_b128 v28, v[24:27], s[16:17] offset:16
 ; GFX12-NEXT:    global_store_b128 v28, v[0:3], s[16:17]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %ld = load <16 x i32>, ptr addrspace(4) %in
   %ext = sext <16 x i32> %ld to <16 x i64>
@@ -3532,8 +3490,6 @@ define amdgpu_kernel void @constant_zextload_v16i32_to_v16i64(ptr addrspace(1) %
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[16:17]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %ld = load <16 x i32>, ptr addrspace(4) %in
   %ext = zext <16 x i32> %ld to <16 x i64>
@@ -4448,8 +4404,6 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) %
 ; GFX12-NEXT:    global_store_b128 v24, v[12:15], s[36:37] offset:32
 ; GFX12-NEXT:    global_store_b128 v24, v[16:19], s[36:37] offset:16
 ; GFX12-NEXT:    global_store_b128 v24, v[20:23], s[36:37]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %ld = load <32 x i32>, ptr addrspace(4) %in
   %ext = sext <32 x i32> %ld to <32 x i64>
@@ -5087,8 +5041,6 @@ define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(ptr addrspace(1) %
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[36:37]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %ld = load <32 x i32>, ptr addrspace(4) %in
   %ext = zext <32 x i32> %ld to <32 x i64>
@@ -5459,8 +5411,6 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs
 ; GFX12-NEXT:    global_store_b128 v32, v[20:23], s[36:37] offset:32
 ; GFX12-NEXT:    global_store_b128 v32, v[24:27], s[36:37] offset:16
 ; GFX12-NEXT:    global_store_b128 v32, v[28:31], s[36:37]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %ld = load <32 x i32>, ptr addrspace(4) %in
   store <32 x i32> %ld, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll
index 46c7c2b08cd64b..7fd5909f3b2b1c 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll
@@ -70,8 +70,6 @@ define amdgpu_kernel void @constant_load_i64(ptr addrspace(1) %out, ptr addrspac
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %ld = load i64, ptr addrspace(4) %in
   store i64 %ld, ptr addrspace(1) %out
@@ -150,8 +148,6 @@ define amdgpu_kernel void @constant_load_v2i64(ptr addrspace(1) %out, ptr addrsp
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6
 ; GFX12-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ld = load <2 x i64>, ptr addrspace(4) %in
@@ -266,8 +262,6 @@ define amdgpu_kernel void @constant_load_v3i64(ptr addrspace(1) %out, ptr addrsp
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b64 v6, v[4:5], s[0:1] offset:16
 ; GFX12-NEXT:    global_store_b128 v6, v[0:3], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ld = load <3 x i64>, ptr addrspace(4) %in
@@ -384,8 +378,6 @@ define amdgpu_kernel void @constant_load_v4i64(ptr addrspace(1) %out, ptr addrsp
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v8, v[0:3], s[8:9] offset:16
 ; GFX12-NEXT:    global_store_b128 v8, v[4:7], s[8:9]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ld = load <4 x i64>, ptr addrspace(4) %in
@@ -576,8 +568,6 @@ define amdgpu_kernel void @constant_load_v8i64(ptr addrspace(1) %out, ptr addrsp
 ; GFX12-NEXT:    global_store_b128 v16, v[4:7], s[16:17] offset:32
 ; GFX12-NEXT:    global_store_b128 v16, v[8:11], s[16:17] offset:16
 ; GFX12-NEXT:    global_store_b128 v16, v[12:15], s[16:17]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ld = load <8 x i64>, ptr addrspace(4) %in
@@ -931,8 +921,6 @@ define amdgpu_kernel void @constant_load_v16i64(ptr addrspace(1) %out, ptr addrs
 ; GFX12-NEXT:    global_store_b128 v32, v[20:23], s[36:37] offset:32
 ; GFX12-NEXT:    global_store_b128 v32, v[24:27], s[36:37] offset:16
 ; GFX12-NEXT:    global_store_b128 v32, v[28:31], s[36:37]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ld = load <16 x i64>, ptr addrspace(4) %in
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
index ce17c81a24dd50..f14d4afbee9d97 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
@@ -84,8 +84,6 @@ define amdgpu_kernel void @constant_load_i8(ptr addrspace(1) %out, ptr addrspace
 ; GFX12-NEXT:    global_load_u8 v1, v0, s[2:3]
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b8 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ld = load i8, ptr addrspace(4) %in
@@ -171,8 +169,6 @@ define amdgpu_kernel void @constant_load_v2i8(ptr addrspace(1) %out, ptr addrspa
 ; GFX12-NEXT:    global_load_u16 v1, v0, s[2:3]
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ld = load <2 x i8>, ptr addrspace(4) %in
@@ -286,8 +282,6 @@ define amdgpu_kernel void @constant_load_v3i8(ptr addrspace(1) %out, ptr addrspa
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_d16_hi_b8 v0, v1, s[0:1] offset:2
 ; GFX12-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ld = load <3 x i8>, ptr addrspace(4) %in
@@ -356,8 +350,6 @@ define amdgpu_kernel void @constant_load_v4i8(ptr addrspace(1) %out, ptr addrspa
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ld = load <4 x i8>, ptr addrspace(4) %in
@@ -430,8 +422,6 @@ define amdgpu_kernel void @constant_load_v8i8(ptr addrspace(1) %out, ptr addrspa
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ld = load <8 x i8>, ptr addrspace(4) %in
@@ -511,8 +501,6 @@ define amdgpu_kernel void @constant_load_v16i8(ptr addrspace(1) %out, ptr addrsp
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6
 ; GFX12-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ld = load <16 x i8>, ptr addrspace(4) %in
@@ -588,8 +576,6 @@ define amdgpu_kernel void @constant_zextload_i8_to_i32(ptr addrspace(1) %out, pt
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %a = load i8, ptr addrspace(4) %in
   %ext = zext i8 %a to i32
@@ -666,8 +652,6 @@ define amdgpu_kernel void @constant_sextload_i8_to_i32(ptr addrspace(1) %out, pt
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %ld = load i8, ptr addrspace(4) %in
   %ext = sext i8 %ld to i32
@@ -743,8 +727,6 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i32(ptr addrspace(1) %out
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <1 x i8>, ptr addrspace(4) %in
   %ext = zext <1 x i8> %load to <1 x i32>
@@ -821,8 +803,6 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i32(ptr addrspace(1) %out
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <1 x i8>, ptr addrspace(4) %in
   %ext = sext <1 x i8> %load to <1 x i32>
@@ -922,8 +902,6 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i32(ptr addrspace(1) %out
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <2 x i8>, ptr addrspace(4) %in
   %ext = zext <2 x i8> %load to <2 x i32>
@@ -1021,8 +999,6 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i32(ptr addrspace(1) %out
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s3
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <2 x i8>, ptr addrspace(4) %in
   %ext = sext <2 x i8> %load to <2 x i32>
@@ -1123,8 +1099,6 @@ define amdgpu_kernel void @constant_zextload_v3i8_to_v3i32(ptr addrspace(1) %out
 ; GFX12-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s4
 ; GFX12-NEXT:    v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v2, s2
 ; GFX12-NEXT:    global_store_b96 v3, v[0:2], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ld = load <3 x i8>, ptr addrspace(4) %in
@@ -1226,8 +1200,6 @@ define amdgpu_kernel void @constant_sextload_v3i8_to_v3i32(ptr addrspace(1) %out
 ; GFX12-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s4
 ; GFX12-NEXT:    v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s3
 ; GFX12-NEXT:    global_store_b96 v3, v[0:2], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ld = load <3 x i8>, ptr addrspace(4) %in
@@ -1333,8 +1305,6 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i32(ptr addrspace(1) %out
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX12-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <4 x i8>, ptr addrspace(4) %in
   %ext = zext <4 x i8> %load to <4 x i32>
@@ -1442,8 +1412,6 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i32(ptr addrspace(1) %out
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX12-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <4 x i8>, ptr addrspace(4) %in
   %ext = sext <4 x i8> %load to <4 x i32>
@@ -1603,8 +1571,6 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i32(ptr addrspace(1) %out
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v8, v[0:3], s[0:1] offset:16
 ; GFX12-NEXT:    global_store_b128 v8, v[4:7], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <8 x i8>, ptr addrspace(4) %in
   %ext = zext <8 x i8> %load to <8 x i32>
@@ -1767,8 +1733,6 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i32(ptr addrspace(1) %out
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v8, v[0:3], s[0:1] offset:16
 ; GFX12-NEXT:    global_store_b128 v8, v[4:7], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <8 x i8>, ptr addrspace(4) %in
   %ext = sext <8 x i8> %load to <8 x i32>
@@ -2035,8 +1999,6 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i32(ptr addrspace(1) %o
 ; GFX12-NEXT:    global_store_b128 v16, v[4:7], s[0:1] offset:32
 ; GFX12-NEXT:    global_store_b128 v16, v[8:11], s[0:1] offset:16
 ; GFX12-NEXT:    global_store_b128 v16, v[12:15], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <16 x i8>, ptr addrspace(4) %in
   %ext = zext <16 x i8> %load to <16 x i32>
@@ -2311,8 +2273,6 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(ptr addrspace(1) %o
 ; GFX12-NEXT:    global_store_b128 v16, v[4:7], s[0:1] offset:32
 ; GFX12-NEXT:    global_store_b128 v16, v[8:11], s[0:1] offset:16
 ; GFX12-NEXT:    global_store_b128 v16, v[12:15], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <16 x i8>, ptr addrspace(4) %in
   %ext = sext <16 x i8> %load to <16 x i32>
@@ -2795,8 +2755,6 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %o
 ; GFX12-NEXT:    global_store_b128 v24, v[12:15], s[0:1] offset:32
 ; GFX12-NEXT:    global_store_b128 v24, v[16:19], s[0:1] offset:16
 ; GFX12-NEXT:    global_store_b128 v24, v[20:23], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <32 x i8>, ptr addrspace(4) %in
   %ext = zext <32 x i8> %load to <32 x i32>
@@ -3302,8 +3260,6 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o
 ; GFX12-NEXT:    global_store_b128 v24, v[12:15], s[0:1] offset:32
 ; GFX12-NEXT:    global_store_b128 v24, v[16:19], s[0:1] offset:16
 ; GFX12-NEXT:    global_store_b128 v24, v[20:23], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <32 x i8>, ptr addrspace(4) %in
   %ext = sext <32 x i8> %load to <32 x i32>
@@ -4221,8 +4177,6 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o
 ; GFX12-NEXT:    global_store_b128 v24, v[12:15], s[16:17] offset:32
 ; GFX12-NEXT:    global_store_b128 v24, v[16:19], s[16:17] offset:16
 ; GFX12-NEXT:    global_store_b128 v24, v[20:23], s[16:17]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <64 x i8>, ptr addrspace(4) %in
   %ext = zext <64 x i8> %load to <64 x i32>
@@ -5180,8 +5134,6 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o
 ; GFX12-NEXT:    global_store_b128 v24, v[12:15], s[16:17] offset:32
 ; GFX12-NEXT:    global_store_b128 v24, v[16:19], s[16:17] offset:16
 ; GFX12-NEXT:    global_store_b128 v24, v[20:23], s[16:17]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <64 x i8>, ptr addrspace(4) %in
   %ext = sext <64 x i8> %load to <64 x i32>
@@ -5263,8 +5215,6 @@ define amdgpu_kernel void @constant_zextload_i8_to_i64(ptr addrspace(1) %out, pt
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX12-NEXT:    global_store_b64 v1, v[0:1], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %a = load i8, ptr addrspace(4) %in
   %ext = zext i8 %a to i64
@@ -5351,8 +5301,6 @@ define amdgpu_kernel void @constant_sextload_i8_to_i64(ptr addrspace(1) %out, pt
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %a = load i8, ptr addrspace(4) %in
   %ext = sext i8 %a to i64
@@ -5432,8 +5380,6 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i64(ptr addrspace(1) %out
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
 ; GFX12-NEXT:    global_store_b64 v1, v[0:1], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <1 x i8>, ptr addrspace(4) %in
   %ext = zext <1 x i8> %load to <1 x i64>
@@ -5520,8 +5466,6 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i64(ptr addrspace(1) %out
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <1 x i8>, ptr addrspace(4) %in
   %ext = sext <1 x i8> %load to <1 x i64>
@@ -5628,8 +5572,6 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i64(ptr addrspace(1) %out
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 8, v2
 ; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <2 x i8>, ptr addrspace(4) %in
   %ext = zext <2 x i8> %load to <2 x i64>
@@ -5742,8 +5684,6 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i64(ptr addrspace(1) %out
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
 ; GFX12-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <2 x i8>, ptr addrspace(4) %in
   %ext = sext <2 x i8> %load to <2 x i64>
@@ -5877,8 +5817,6 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i64(ptr addrspace(1) %out
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-NEXT:    v_mov_b32_e32 v2, s3
 ; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <4 x i8>, ptr addrspace(4) %in
   %ext = zext <4 x i8> %load to <4 x i64>
@@ -6031,8 +5969,6 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i64(ptr addrspace(1) %out
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v8, v[4:7], s[0:1] offset:16
 ; GFX12-NEXT:    global_store_b128 v8, v[0:3], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <4 x i8>, ptr addrspace(4) %in
   %ext = sext <4 x i8> %load to <4 x i64>
@@ -6245,8 +6181,6 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i64(ptr addrspace(1) %out
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-NEXT:    v_mov_b32_e32 v2, s3
 ; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <8 x i8>, ptr addrspace(4) %in
   %ext = zext <8 x i8> %load to <8 x i64>
@@ -6502,8 +6436,6 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out
 ; GFX12-NEXT:    global_store_b128 v16, v[8:11], s[0:1] offset:32
 ; GFX12-NEXT:    global_store_b128 v16, v[12:15], s[0:1] offset:16
 ; GFX12-NEXT:    global_store_b128 v16, v[4:7], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <8 x i8>, ptr addrspace(4) %in
   %ext = sext <8 x i8> %load to <8 x i64>
@@ -6875,8 +6807,6 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %o
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s3
 ; GFX12-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <16 x i8>, ptr addrspace(4) %in
   %ext = zext <16 x i8> %load to <16 x i64>
@@ -7339,8 +7269,6 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o
 ; GFX12-NEXT:    global_store_b128 v24, v[0:3], s[0:1] offset:32
 ; GFX12-NEXT:    global_store_b128 v24, v[8:11], s[0:1] offset:16
 ; GFX12-NEXT:    global_store_b128 v24, v[20:23], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <16 x i8>, ptr addrspace(4) %in
   %ext = sext <16 x i8> %load to <16 x i64>
@@ -8034,8 +7962,6 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %o
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[8:9]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <32 x i8>, ptr addrspace(4) %in
   %ext = zext <32 x i8> %load to <32 x i64>
@@ -8933,8 +8859,6 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o
 ; GFX12-NEXT:    global_store_b128 v24, v[12:15], s[8:9] offset:32
 ; GFX12-NEXT:    global_store_b128 v24, v[16:19], s[8:9] offset:16
 ; GFX12-NEXT:    global_store_b128 v24, v[20:23], s[8:9]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <32 x i8>, ptr addrspace(4) %in
   %ext = sext <32 x i8> %load to <32 x i64>
@@ -9035,8 +8959,6 @@ define amdgpu_kernel void @constant_zextload_i8_to_i16(ptr addrspace(1) %out, pt
 ; GFX12-NEXT:    global_load_u8 v1, v0, s[2:3]
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %a = load i8, ptr addrspace(4) %in
   %ext = zext i8 %a to i16
@@ -9123,8 +9045,6 @@ define amdgpu_kernel void @constant_sextload_i8_to_i16(ptr addrspace(1) %out, pt
 ; GFX12-NEXT:    global_load_i8 v1, v0, s[2:3]
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %a = load i8, ptr addrspace(4) %in
   %ext = sext i8 %a to i16
@@ -9209,8 +9129,6 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i16(ptr addrspace(1) %out
 ; GFX12-NEXT:    global_load_u8 v1, v0, s[2:3]
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <1 x i8>, ptr addrspace(4) %in
   %ext = zext <1 x i8> %load to <1 x i16>
@@ -9297,8 +9215,6 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i16(ptr addrspace(1) %out
 ; GFX12-NEXT:    global_load_i8 v1, v0, s[2:3]
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <1 x i8>, ptr addrspace(4) %in
   %ext = sext <1 x i8> %load to <1 x i16>
@@ -9396,8 +9312,6 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i16(ptr addrspace(1) %out
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <2 x i8>, ptr addrspace(4) %in
   %ext = zext <2 x i8> %load to <2 x i16>
@@ -9512,8 +9426,6 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i16(ptr addrspace(1) %out
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <2 x i8>, ptr addrspace(4) %in
   %ext = sext <2 x i8> %load to <2 x i16>
@@ -9645,8 +9557,6 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s3
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <4 x i8>, ptr addrspace(4) %in
   %ext = zext <4 x i8> %load to <4 x i16>
@@ -9797,8 +9707,6 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i16(ptr addrspace(1) %out
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <4 x i8>, ptr addrspace(4) %in
   %ext = sext <4 x i8> %load to <4 x i16>
@@ -9995,8 +9903,6 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    v_mov_b32_e32 v2, s3
 ; GFX12-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <8 x i8>, ptr addrspace(4) %in
   %ext = zext <8 x i8> %load to <8 x i16>
@@ -10231,8 +10137,6 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(ptr addrspace(1) %out
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s4
 ; GFX12-NEXT:    v_mov_b32_e32 v2, s3
 ; GFX12-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <8 x i8>, ptr addrspace(4) %in
   %ext = sext <8 x i8> %load to <8 x i16>
@@ -10576,8 +10480,6 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v8, v[0:3], s[0:1] offset:16
 ; GFX12-NEXT:    global_store_b128 v8, v[4:7], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <16 x i8>, ptr addrspace(4) %in
   %ext = zext <16 x i8> %load to <16 x i16>
@@ -11001,8 +10903,6 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v8, v[0:3], s[0:1] offset:16
 ; GFX12-NEXT:    global_store_b128 v8, v[4:7], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <16 x i8>, ptr addrspace(4) %in
   %ext = sext <16 x i8> %load to <16 x i16>
@@ -11638,8 +11538,6 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o
 ; GFX12-NEXT:    global_store_b128 v16, v[4:7], s[8:9] offset:32
 ; GFX12-NEXT:    global_store_b128 v16, v[8:11], s[8:9] offset:16
 ; GFX12-NEXT:    global_store_b128 v16, v[12:15], s[8:9]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <32 x i8>, ptr addrspace(4) %in
   %ext = zext <32 x i8> %load to <32 x i16>
@@ -12439,8 +12337,6 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o
 ; GFX12-NEXT:    global_store_b128 v16, v[4:7], s[8:9] offset:32
 ; GFX12-NEXT:    global_store_b128 v16, v[8:11], s[8:9] offset:16
 ; GFX12-NEXT:    global_store_b128 v16, v[12:15], s[8:9]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %load = load <32 x i8>, ptr addrspace(4) %in
   %ext = sext <32 x i8> %load to <32 x i16>
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
index 08b089a32d1d75..22bb01ba2be116 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
@@ -7103,8 +7103,6 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
 ; GFX12-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 ;
 ; GFX940-LABEL: local_ds_fadd:
@@ -7279,8 +7277,6 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: local_ds_fadd:
@@ -7971,8 +7967,6 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
 ; GFX12-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 ;
 ; GFX940-LABEL: local_ds_fadd_one_as:
@@ -8138,8 +8132,6 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_add_f32 v0, s2, v0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: local_ds_fadd_one_as:
diff --git a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll
index df954f6f940c8c..f2dcd151e5b59d 100644
--- a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll
+++ b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll
@@ -71,8 +71,6 @@ define amdgpu_kernel void @copy_global(ptr addrspace(1) nocapture %d, ptr addrsp
 ; GCN-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 16
 ; GCN-NEXT:    s_cbranch_scc1 .LBB1_2
 ; GCN-NEXT:  .LBB1_3: ; %for.end
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GCN-NEXT:    s_endpgm
 entry:
   %cmp6.not = icmp eq i32 %n, 0
@@ -118,8 +116,6 @@ define amdgpu_kernel void @copy_constant(ptr addrspace(1) nocapture %d, ptr addr
 ; GCN-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 16
 ; GCN-NEXT:    s_cbranch_scc1 .LBB2_2
 ; GCN-NEXT:  .LBB2_3: ; %for.end
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GCN-NEXT:    s_endpgm
 entry:
   %cmp6.not = icmp eq i32 %n, 0
diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll
index acba2841a71073..79dfabe3b5450e 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll
@@ -54,8 +54,6 @@ define amdgpu_kernel void @workgroup_ids_kernel() {
 ; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, s0
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-SDAG-NEXT:    buffer_store_b96 v[0:2], off, s[0:3], null
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: workgroup_ids_kernel:
@@ -66,8 +64,6 @@ define amdgpu_kernel void @workgroup_ids_kernel() {
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX12-GISEL-NEXT:    buffer_store_b96 v[0:2], off, s[0:3], null
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
 .entry:
   %idx = call i32 @llvm.amdgcn.workgroup.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll
index 1da05ed264a647..356439bad8f008 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll
@@ -12,6 +12,16 @@ define amdgpu_cs void @_amdgpu_cs_main() {
 ; GFX9-NEXT:    buffer_store_dwordx3 v[0:2], off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX9-SDAG-LABEL: _amdgpu_cs_main:
+; GFX9-SDAG:       ; %bb.0: ; %.entry
+; GFX9-SDAG-NEXT:    buffer_store_dwordx3 v[0:2], off, s[0:3], 0
+; GFX9-SDAG-NEXT:    s_endpgm
+;
+; GFX9-GISEL-LABEL: _amdgpu_cs_main:
+; GFX9-GISEL:       ; %bb.0: ; %.entry
+; GFX9-GISEL-NEXT:    buffer_store_dwordx3 v[0:2], off, s[0:3], 0
+; GFX9-GISEL-NEXT:    s_endpgm
+;
 ; GFX9ARCH-SDAG-LABEL: _amdgpu_cs_main:
 ; GFX9ARCH-SDAG:       ; %bb.0: ; %.entry
 ; GFX9ARCH-SDAG-NEXT:    s_lshr_b32 s0, ttmp7, 16
@@ -40,8 +50,6 @@ define amdgpu_cs void @_amdgpu_cs_main() {
 ; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, s0
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-SDAG-NEXT:    buffer_store_b96 v[0:2], off, s[0:3], null
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: _amdgpu_cs_main:
@@ -52,8 +60,6 @@ define amdgpu_cs void @_amdgpu_cs_main() {
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX12-GISEL-NEXT:    buffer_store_b96 v[0:2], off, s[0:3], null
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
 .entry:
   %idx = call i32 @llvm.amdgcn.workgroup.id.x()
@@ -67,6 +73,38 @@ define amdgpu_cs void @_amdgpu_cs_main() {
 }
 
 define amdgpu_cs void @caller() {
+; GFX9-SDAG-LABEL: caller:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_getpc_b64 s[8:9]
+; GFX9-SDAG-NEXT:    s_mov_b32 s8, s0
+; GFX9-SDAG-NEXT:    s_load_dwordx4 s[8:11], s[8:9], 0x10
+; GFX9-SDAG-NEXT:    s_mov_b32 s5, callee at abs32@hi
+; GFX9-SDAG-NEXT:    s_mov_b32 s4, callee at abs32@lo
+; GFX9-SDAG-NEXT:    s_mov_b32 s32, 0
+; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT:    s_add_u32 s8, s8, s0
+; GFX9-SDAG-NEXT:    s_addc_u32 s9, s9, 0
+; GFX9-SDAG-NEXT:    s_mov_b64 s[0:1], s[8:9]
+; GFX9-SDAG-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; GFX9-SDAG-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-SDAG-NEXT:    s_endpgm
+;
+; GFX9-GISEL-LABEL: caller:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_getpc_b64 s[8:9]
+; GFX9-GISEL-NEXT:    s_mov_b32 s8, s0
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[8:9], 0x10
+; GFX9-GISEL-NEXT:    s_mov_b32 s4, callee at abs32@lo
+; GFX9-GISEL-NEXT:    s_mov_b32 s5, callee at abs32@hi
+; GFX9-GISEL-NEXT:    s_mov_b32 s32, 0
+; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT:    s_add_u32 s8, s8, s0
+; GFX9-GISEL-NEXT:    s_addc_u32 s9, s9, 0
+; GFX9-GISEL-NEXT:    s_mov_b64 s[0:1], s[8:9]
+; GFX9-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; GFX9-GISEL-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-GISEL-NEXT:    s_endpgm
+;
 ; GFX9ARCH-SDAG-LABEL: caller:
 ; GFX9ARCH-SDAG:       ; %bb.0:
 ; GFX9ARCH-SDAG-NEXT:    s_getpc_b64 s[8:9]
@@ -129,20 +167,20 @@ declare amdgpu_gfx void @callee(i32)
 
 define amdgpu_gfx void @workgroup_ids_gfx(ptr addrspace(1) %outx, ptr addrspace(1) %outy, ptr addrspace(1) %outz) {
 ; GFX9-SDAG-LABEL: workgroup_ids_gfx:
-; GFX9-SDAG:           ; %bb.0:
-; GFX9-SDAG-NEXT:        s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:        global_store_dword v[0:1], v0, off
-; GFX9-SDAG-NEXT:        s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT:        global_store_dword v[2:3], v0, off
-; GFX9-SDAG-NEXT:        s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT:        global_store_dword v[4:5], v0, off
-; GFX9-SDAG-NEXT:        s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT:        s_setpc_b64 s[30:31]
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    global_store_dword v[0:1], v0, off
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    global_store_dword v[2:3], v0, off
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    global_store_dword v[4:5], v0, off
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: workgroup_ids_gfx:
-; GFX9-GISEL:          ; %bb.0:
-; GFX9-GISEL-NEXT:       s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:       s_setpc_b64 s[30:31]
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9ARCH-SDAG-LABEL: workgroup_ids_gfx:
 ; GFX9ARCH-SDAG:       ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
index a4bde5c9d82153..0abe0f91dc0bf9 100644
--- a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
@@ -68,8 +68,6 @@ define amdgpu_kernel void @s_lshr_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_pk_lshrrev_b16 v1, s3, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %result = lshr <2 x i16> %lhs, %rhs
   store <2 x i16> %result, ptr addrspace(1) %out
@@ -151,8 +149,6 @@ define amdgpu_kernel void @v_lshr_v2i16(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_lshrrev_b16 v0, v1, v0
 ; GFX11-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -250,8 +246,6 @@ define amdgpu_kernel void @lshr_v_s_v2i16(ptr addrspace(1) %out, ptr addrspace(1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_lshrrev_b16 v1, s0, v1
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -347,8 +341,6 @@ define amdgpu_kernel void @lshr_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_lshrrev_b16 v1, v1, s0
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -434,8 +426,6 @@ define amdgpu_kernel void @lshr_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_lshrrev_b16 v1, v1, 8 op_sel_hi:[1,0]
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -517,8 +507,6 @@ define amdgpu_kernel void @lshr_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -618,8 +606,6 @@ define amdgpu_kernel void @v_lshr_v4i16(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX11-NEXT:    v_pk_lshrrev_b16 v1, v3, v1
 ; GFX11-NEXT:    v_pk_lshrrev_b16 v0, v2, v0
 ; GFX11-NEXT:    global_store_b64 v4, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -711,8 +697,6 @@ define amdgpu_kernel void @lshr_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace
 ; GFX11-NEXT:    v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
 ; GFX11-NEXT:    v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
diff --git a/llvm/test/CodeGen/AMDGPU/mad.u16.ll b/llvm/test/CodeGen/AMDGPU/mad.u16.ll
index a3e7bf2caf7724..48dc95312f5995 100644
--- a/llvm/test/CodeGen/AMDGPU/mad.u16.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad.u16.ll
@@ -80,8 +80,6 @@ define amdgpu_kernel void @mad_u16(
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mad_u16 v0, v1, v2, v0
 ; GFX11-NEXT:    global_store_b16 v3, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
diff --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
index 8eb0a46cc8b17f..85b4fd0602f14c 100644
--- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
@@ -968,8 +968,6 @@ define amdgpu_kernel void @mad_i64_i32_uniform(ptr addrspace(1) %out, i32 %arg0,
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: mad_i64_i32_uniform:
@@ -988,8 +986,6 @@ define amdgpu_kernel void @mad_i64_i32_uniform(ptr addrspace(1) %out, i32 %arg0,
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %ext0 = zext i32 %arg0 to i64
   %ext1 = zext i32 %arg1 to i64
diff --git a/llvm/test/CodeGen/AMDGPU/madak.ll b/llvm/test/CodeGen/AMDGPU/madak.ll
index f2815915b84259..5bcdf4d0aaf16f 100644
--- a/llvm/test/CodeGen/AMDGPU/madak.ll
+++ b/llvm/test/CodeGen/AMDGPU/madak.ll
@@ -99,8 +99,6 @@ define amdgpu_kernel void @madak_f32(ptr addrspace(1) noalias %out, ptr addrspac
 ; GFX11-MAD-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-MAD-NEXT:    v_add_f32_e32 v1, 0x41200000, v1
 ; GFX11-MAD-NEXT:    global_store_b32 v0, v1, s[4:5]
-; GFX11-MAD-NEXT:    s_nop 0
-; GFX11-MAD-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-MAD-NEXT:    s_endpgm
 ;
 ; GFX940-FMA-LABEL: madak_f32:
@@ -147,8 +145,6 @@ define amdgpu_kernel void @madak_f32(ptr addrspace(1) noalias %out, ptr addrspac
 ; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FMA-NEXT:    v_fmaak_f32 v1, v1, v2, 0x41200000
 ; GFX11-FMA-NEXT:    global_store_b32 v0, v1, s[4:5]
-; GFX11-FMA-NEXT:    s_nop 0
-; GFX11-FMA-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FMA-NEXT:    s_endpgm
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %in.a.gep = getelementptr float, ptr addrspace(1) %in.a, i32 %tid
@@ -283,8 +279,6 @@ define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr ad
 ; GFX11-MAD-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-MAD-NEXT:    global_store_b32 v0, v1, s[2:3] offset:4 dlc
 ; GFX11-MAD-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-MAD-NEXT:    s_nop 0
-; GFX11-MAD-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-MAD-NEXT:    s_endpgm
 ;
 ; GFX940-FMA-LABEL: madak_2_use_f32:
@@ -346,8 +340,6 @@ define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr ad
 ; GFX11-FMA-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-FMA-NEXT:    global_store_b32 v0, v1, s[2:3] offset:4 dlc
 ; GFX11-FMA-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-FMA-NEXT:    s_nop 0
-; GFX11-FMA-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FMA-NEXT:    s_endpgm
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
 
@@ -440,8 +432,6 @@ define amdgpu_kernel void @madak_m_inline_imm_f32(ptr addrspace(1) noalias %out,
 ; GFX11-MAD-NEXT:    v_mul_f32_e32 v1, 4.0, v1
 ; GFX11-MAD-NEXT:    v_add_f32_e32 v1, 0x41200000, v1
 ; GFX11-MAD-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-MAD-NEXT:    s_nop 0
-; GFX11-MAD-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-MAD-NEXT:    s_endpgm
 ;
 ; GFX940-FMA-LABEL: madak_m_inline_imm_f32:
@@ -478,8 +468,6 @@ define amdgpu_kernel void @madak_m_inline_imm_f32(ptr addrspace(1) noalias %out,
 ; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FMA-NEXT:    v_fmaak_f32 v1, 4.0, v1, 0x41200000
 ; GFX11-FMA-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-FMA-NEXT:    s_nop 0
-; GFX11-FMA-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FMA-NEXT:    s_endpgm
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %in.a.gep = getelementptr float, ptr addrspace(1) %in.a, i32 %tid
@@ -582,8 +570,6 @@ define amdgpu_kernel void @madak_inline_imm_f32(ptr addrspace(1) noalias %out, p
 ; GFX11-MAD-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-MAD-NEXT:    v_add_f32_e32 v1, 4.0, v1
 ; GFX11-MAD-NEXT:    global_store_b32 v0, v1, s[4:5]
-; GFX11-MAD-NEXT:    s_nop 0
-; GFX11-MAD-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-MAD-NEXT:    s_endpgm
 ;
 ; GFX940-FMA-LABEL: madak_inline_imm_f32:
@@ -630,8 +616,6 @@ define amdgpu_kernel void @madak_inline_imm_f32(ptr addrspace(1) noalias %out, p
 ; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FMA-NEXT:    v_fma_f32 v1, v1, v2, 4.0
 ; GFX11-FMA-NEXT:    global_store_b32 v0, v1, s[4:5]
-; GFX11-FMA-NEXT:    s_nop 0
-; GFX11-FMA-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FMA-NEXT:    s_endpgm
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %in.a.gep = getelementptr float, ptr addrspace(1) %in.a, i32 %tid
@@ -726,8 +710,6 @@ define amdgpu_kernel void @s_v_madak_f32(ptr addrspace(1) noalias %out, ptr addr
 ; GFX11-MAD-NEXT:    v_mul_f32_e32 v1, s0, v1
 ; GFX11-MAD-NEXT:    v_add_f32_e32 v1, 0x41200000, v1
 ; GFX11-MAD-NEXT:    global_store_b32 v0, v1, s[4:5]
-; GFX11-MAD-NEXT:    s_nop 0
-; GFX11-MAD-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-MAD-NEXT:    s_endpgm
 ;
 ; GFX940-FMA-LABEL: s_v_madak_f32:
@@ -770,8 +752,6 @@ define amdgpu_kernel void @s_v_madak_f32(ptr addrspace(1) noalias %out, ptr addr
 ; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FMA-NEXT:    v_fmaak_f32 v1, s0, v1, 0x41200000
 ; GFX11-FMA-NEXT:    global_store_b32 v0, v1, s[4:5]
-; GFX11-FMA-NEXT:    s_nop 0
-; GFX11-FMA-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FMA-NEXT:    s_endpgm
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %in.a.gep = getelementptr float, ptr addrspace(1) %in.a, i32 %tid
@@ -869,8 +849,6 @@ define amdgpu_kernel void @v_s_madak_f32(ptr addrspace(1) noalias %out, float %a
 ; GFX11-MAD-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-MAD-NEXT:    v_add_f32_e32 v1, 0x41200000, v1
 ; GFX11-MAD-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-MAD-NEXT:    s_nop 0
-; GFX11-MAD-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-MAD-NEXT:    s_endpgm
 ;
 ; GFX940-FMA-LABEL: v_s_madak_f32:
@@ -917,8 +895,6 @@ define amdgpu_kernel void @v_s_madak_f32(ptr addrspace(1) noalias %out, float %a
 ; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-FMA-NEXT:    v_fmaak_f32 v1, s4, v1, 0x41200000
 ; GFX11-FMA-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-FMA-NEXT:    s_nop 0
-; GFX11-FMA-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FMA-NEXT:    s_endpgm
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %in.b.gep = getelementptr float, ptr addrspace(1) %in.b, i32 %tid
@@ -988,8 +964,6 @@ define amdgpu_kernel void @s_s_madak_f32(ptr addrspace(1) %out, float %a, float
 ; GFX11-MAD-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-MAD-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_add_f32 v0, 0x41200000, v0
 ; GFX11-MAD-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-MAD-NEXT:    s_nop 0
-; GFX11-MAD-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-MAD-NEXT:    s_endpgm
 ;
 ; GFX940-FMA-LABEL: s_s_madak_f32:
@@ -1021,8 +995,6 @@ define amdgpu_kernel void @s_s_madak_f32(ptr addrspace(1) %out, float %a, float
 ; GFX11-FMA-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FMA-NEXT:    v_fmaak_f32 v0, s2, v0, 0x41200000
 ; GFX11-FMA-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-FMA-NEXT:    s_nop 0
-; GFX11-FMA-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FMA-NEXT:    s_endpgm
   %mul = fmul float %a, %b
   %madak = fadd float %mul, 10.0
@@ -1120,8 +1092,6 @@ define amdgpu_kernel void @no_madak_src0_modifier_f32(ptr addrspace(1) noalias %
 ; GFX11-MAD-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-MAD-NEXT:    v_add_f32_e32 v1, 0x41200000, v1
 ; GFX11-MAD-NEXT:    global_store_b32 v0, v1, s[4:5]
-; GFX11-MAD-NEXT:    s_nop 0
-; GFX11-MAD-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-MAD-NEXT:    s_endpgm
 ;
 ; GFX940-FMA-LABEL: no_madak_src0_modifier_f32:
@@ -1169,8 +1139,6 @@ define amdgpu_kernel void @no_madak_src0_modifier_f32(ptr addrspace(1) noalias %
 ; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FMA-NEXT:    v_fma_f32 v1, |v1|, v2, 0x41200000
 ; GFX11-FMA-NEXT:    global_store_b32 v0, v1, s[4:5]
-; GFX11-FMA-NEXT:    s_nop 0
-; GFX11-FMA-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FMA-NEXT:    s_endpgm
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %in.a.gep = getelementptr float, ptr addrspace(1) %in.a, i32 %tid
@@ -1278,8 +1246,6 @@ define amdgpu_kernel void @no_madak_src1_modifier_f32(ptr addrspace(1) noalias %
 ; GFX11-MAD-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-MAD-NEXT:    v_add_f32_e32 v1, 0x41200000, v1
 ; GFX11-MAD-NEXT:    global_store_b32 v0, v1, s[4:5]
-; GFX11-MAD-NEXT:    s_nop 0
-; GFX11-MAD-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-MAD-NEXT:    s_endpgm
 ;
 ; GFX940-FMA-LABEL: no_madak_src1_modifier_f32:
@@ -1327,8 +1293,6 @@ define amdgpu_kernel void @no_madak_src1_modifier_f32(ptr addrspace(1) noalias %
 ; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FMA-NEXT:    v_fma_f32 v1, v1, |v2|, 0x41200000
 ; GFX11-FMA-NEXT:    global_store_b32 v0, v1, s[4:5]
-; GFX11-FMA-NEXT:    s_nop 0
-; GFX11-FMA-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FMA-NEXT:    s_endpgm
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %in.a.gep = getelementptr float, ptr addrspace(1) %in.a, i32 %tid
@@ -1464,8 +1428,6 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl
 ; GFX11-MAD-NEXT:    v_mul_f32_e32 v0, v1, v0
 ; GFX11-MAD-NEXT:    global_store_b32 v[0:1], v0, off dlc
 ; GFX11-MAD-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-MAD-NEXT:    s_nop 0
-; GFX11-MAD-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-MAD-NEXT:    s_endpgm
 ;
 ; GFX940-FMA-LABEL: madak_constant_bus_violation:
@@ -1533,8 +1495,6 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl
 ; GFX11-FMA-NEXT:    v_mul_f32_e32 v0, v1, v0
 ; GFX11-FMA-NEXT:    global_store_b32 v[0:1], v0, off dlc
 ; GFX11-FMA-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-FMA-NEXT:    s_nop 0
-; GFX11-FMA-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FMA-NEXT:    s_endpgm
 bb:
   %tmp = icmp eq i32 %arg1, 0
diff --git a/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll b/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll
index 92536c2078514a..6910004a9ef505 100644
--- a/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll
@@ -84,8 +84,6 @@ define amdgpu_kernel void @test(ptr addrspace(1) %src, ptr addrspace(1) %dst) {
 ; GFX11-NEXT:    v_not_b32_e32 v1, v1
 ; GFX11-NEXT:    v_not_b32_e32 v0, v0
 ; GFX11-NEXT:    global_store_b128 v[4:5], v[0:3], off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %implicitarg.ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
diff --git a/llvm/test/CodeGen/AMDGPU/max-hard-clause-length.ll b/llvm/test/CodeGen/AMDGPU/max-hard-clause-length.ll
index e929da796de6de..9b413f95abae36 100644
--- a/llvm/test/CodeGen/AMDGPU/max-hard-clause-length.ll
+++ b/llvm/test/CodeGen/AMDGPU/max-hard-clause-length.ll
@@ -170,8 +170,6 @@ define amdgpu_kernel void @long_store_chain(ptr addrspace(1) %p) {
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[4:5] offset:1024
 ; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[4:5] offset:1040
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: long_store_chain:
@@ -255,8 +253,6 @@ define amdgpu_kernel void @long_store_chain(ptr addrspace(1) %p) {
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v4, v[0:3], s[4:5] offset:1024
 ; GFX12-NEXT:    global_store_b128 v4, v[0:3], s[4:5] offset:1040
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   store <4 x i32> zeroinitializer, ptr addrspace(1) %p
   %ptr1 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 1
diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll
index 05ef2698c1f774..ff39439a2db1af 100644
--- a/llvm/test/CodeGen/AMDGPU/min.ll
+++ b/llvm/test/CodeGen/AMDGPU/min.ll
@@ -116,8 +116,6 @@ define amdgpu_kernel void @v_test_imin_sle_i32(ptr addrspace(1) %out, ptr addrsp
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_min_i32_e32 v1, v1, v2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr, i32 %tid
@@ -194,8 +192,6 @@ define amdgpu_kernel void @s_test_imin_sle_i32(ptr addrspace(1) %out, i32 %a, i3
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %cmp = icmp sle i32 %a, %b
   %val = select i1 %cmp, i32 %a, i32 %b
@@ -266,8 +262,6 @@ define amdgpu_kernel void @s_test_imin_sle_v1i32(ptr addrspace(1) %out, <1 x i32
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %cmp = icmp sle <1 x i32> %a, %b
   %val = select <1 x i1> %cmp, <1 x i32> %a, <1 x i32> %b
@@ -377,8 +371,6 @@ define amdgpu_kernel void @s_test_imin_sle_v4i32(ptr addrspace(1) %out, <4 x i32
 ; GFX11-NEXT:    v_mov_b32_e32 v2, s3
 ; GFX11-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %cmp = icmp sle <4 x i32> %a, %b
   %val = select <4 x i1> %cmp, <4 x i32> %a, <4 x i32> %b
@@ -490,8 +482,6 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32],
 ; GFX11-NEXT:    s_min_i32 s2, s2, s3
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-NEXT:    global_store_b8 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %cmp = icmp sle i8 %a, %b
   %val = select i1 %cmp, i8 %a, i8 %b
@@ -725,8 +715,6 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32]
 ; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %cmp = icmp sle <4 x i8> %a, %b
   %val = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b
@@ -830,8 +818,6 @@ define amdgpu_kernel void @s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_pk_min_i16 v1, s2, s3
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %cmp = icmp sle <2 x i16> %a, %b
   %val = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b
@@ -1013,8 +999,6 @@ define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16
 ; GFX11-NEXT:    v_pk_min_i16 v1, s5, s7
 ; GFX11-NEXT:    v_pk_min_i16 v0, s4, s6
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %cmp = icmp sle <4 x i16> %a, %b
   %val = select <4 x i1> %cmp, <4 x i16> %a, <4 x i16> %b
@@ -1132,8 +1116,6 @@ define amdgpu_kernel void @v_test_imin_slt_i32(ptr addrspace(1) %out, ptr addrsp
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_min_i32_e32 v1, v1, v2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %a.gep = getelementptr inbounds i32, ptr addrspace(1) %aptr, i32 %tid
@@ -1273,8 +1255,6 @@ define amdgpu_kernel void @v_test_imin_slt_i16(ptr addrspace(1) %out, ptr addrsp
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_min_i16 v1, v1, v2
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %a.gep = getelementptr inbounds i16, ptr addrspace(1) %aptr, i32 %tid
@@ -1352,8 +1332,6 @@ define amdgpu_kernel void @s_test_imin_slt_i32(ptr addrspace(1) %out, i32 %a, i3
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %cmp = icmp slt i32 %a, %b
   %val = select i1 %cmp, i32 %a, i32 %b
@@ -1441,8 +1419,6 @@ define amdgpu_kernel void @s_test_imin_slt_v2i32(ptr addrspace(1) %out, <2 x i32
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %cmp = icmp slt <2 x i32> %a, %b
   %val = select <2 x i1> %cmp, <2 x i32> %a, <2 x i32> %b
@@ -1520,8 +1496,6 @@ define amdgpu_kernel void @s_test_imin_slt_imm_i32(ptr addrspace(1) %out, i32 %a
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %cmp = icmp slt i32 %a, 8
   %val = select i1 %cmp, i32 %a, i32 8
@@ -1599,8 +1573,6 @@ define amdgpu_kernel void @s_test_imin_sle_imm_i32(ptr addrspace(1) %out, i32 %a
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %cmp = icmp sle i32 %a, 8
   %val = select i1 %cmp, i32 %a, i32 8
@@ -1718,8 +1690,6 @@ define amdgpu_kernel void @v_test_umin_ule_i32(ptr addrspace(1) %out, ptr addrsp
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_min_u32_e32 v1, v1, v2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr, i32 %tid
@@ -1859,8 +1829,6 @@ define amdgpu_kernel void @v_test_umin_ule_v3i32(ptr addrspace(1) %out, ptr addr
 ; GFX11-NEXT:    v_min_u32_e32 v1, v1, v4
 ; GFX11-NEXT:    v_min_u32_e32 v0, v0, v3
 ; GFX11-NEXT:    global_store_b96 v6, v[0:2], s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %a.gep = getelementptr inbounds <3 x i32>, ptr addrspace(1) %a.ptr, i32 %tid
@@ -2039,8 +2007,6 @@ define amdgpu_kernel void @v_test_umin_ule_v3i16(ptr addrspace(1) %out, ptr addr
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b16 v4, v1, s[4:5] offset:4
 ; GFX11-NEXT:    global_store_b32 v4, v0, s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %a.gep = getelementptr inbounds <3 x i16>, ptr addrspace(1) %a.ptr, i32 %tid
@@ -2118,8 +2084,6 @@ define amdgpu_kernel void @s_test_umin_ule_i32(ptr addrspace(1) %out, i32 %a, i3
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %cmp = icmp ule i32 %a, %b
   %val = select i1 %cmp, i32 %a, i32 %b
@@ -2237,8 +2201,6 @@ define amdgpu_kernel void @v_test_umin_ult_i32(ptr addrspace(1) %out, ptr addrsp
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_min_u32_e32 v1, v1, v2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr, i32 %tid
@@ -2363,8 +2325,6 @@ define amdgpu_kernel void @v_test_umin_ult_i8(ptr addrspace(1) %out, ptr addrspa
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_min_u16 v1, v1, v2
 ; GFX11-NEXT:    global_store_b8 v0, v1, s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %a.gep = getelementptr inbounds i8, ptr addrspace(1) %a.ptr, i32 %tid
@@ -2442,8 +2402,6 @@ define amdgpu_kernel void @s_test_umin_ult_i32(ptr addrspace(1) %out, i32 %a, i3
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %cmp = icmp ult i32 %a, %b
   %val = select i1 %cmp, i32 %a, i32 %b
@@ -2581,8 +2539,6 @@ define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0,
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b32 v1, v2, s[0:1]
 ; GFX11-NEXT:    global_store_b8 v1, v0, s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %a = load i32, ptr addrspace(1) %aptr, align 4
   %b = load i32, ptr addrspace(1) %bptr, align 4
@@ -2725,8 +2681,6 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0,
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
 ; GFX11-NEXT:    global_store_b8 v0, v2, s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %a = load i16, ptr addrspace(1) %aptr, align 2
   %b = load i16, ptr addrspace(1) %bptr, align 2
@@ -2800,8 +2754,6 @@ define amdgpu_kernel void @s_test_umin_ult_v1i32(ptr addrspace(1) %out, <1 x i32
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %cmp = icmp ult <1 x i32> %a, %b
   %val = select <1 x i1> %cmp, <1 x i32> %a, <1 x i32> %b
@@ -2974,8 +2926,6 @@ define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 x i32
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b128 v8, v[0:3], s[0:1] offset:16
 ; GFX11-NEXT:    global_store_b128 v8, v[4:7], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %cmp = icmp ult <8 x i32> %a, %b
   %val = select <8 x i1> %cmp, <8 x i32> %a, <8 x i32> %b
@@ -3258,8 +3208,6 @@ define amdgpu_kernel void @s_test_umin_ult_v8i16(ptr addrspace(1) %out, <8 x i16
 ; GFX11-NEXT:    v_pk_min_u16 v1, s5, s9
 ; GFX11-NEXT:    v_pk_min_u16 v0, s4, s8
 ; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %cmp = icmp ult <8 x i16> %a, %b
   %val = select <8 x i1> %cmp, <8 x i16> %a, <8 x i16> %b
@@ -3364,8 +3312,6 @@ define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(ptr addrspac
 ; GFX11-NEXT:    s_min_u32 s2, s2, s3
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %a.ext = zext i16 %a to i32
   %b.ext = zext i16 %b to i32
@@ -3473,8 +3419,6 @@ define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(ptr addrspace
 ; GFX11-NEXT:    s_min_i32 s2, s2, s3
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %a.ext = sext i16 %a to i32
   %b.ext = sext i16 %b to i32
@@ -3585,8 +3529,6 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1
 ; GFX11-NEXT:    s_min_i32 s2, s2, s3
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %cmp = icmp sle i16 %a, %b
   %val = select i1 %cmp, i16 %a, i16 %b
@@ -3697,8 +3639,6 @@ define amdgpu_kernel void @test_umin_ult_i64(ptr addrspace(1) %out, i64 %a, i64
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tmp = icmp ult i64 %a, %b
   %val = select i1 %tmp, i64 %a, i64 %b
@@ -3807,8 +3747,6 @@ define amdgpu_kernel void @test_umin_ule_i64(ptr addrspace(1) %out, i64 %a, i64
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tmp = icmp ule i64 %a, %b
   %val = select i1 %tmp, i64 %a, i64 %b
@@ -3917,8 +3855,6 @@ define amdgpu_kernel void @test_imin_slt_i64(ptr addrspace(1) %out, i64 %a, i64
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tmp = icmp slt i64 %a, %b
   %val = select i1 %tmp, i64 %a, i64 %b
@@ -4027,8 +3963,6 @@ define amdgpu_kernel void @test_imin_sle_i64(ptr addrspace(1) %out, i64 %a, i64
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tmp = icmp sle i64 %a, %b
   %val = select i1 %tmp, i64 %a, i64 %b
@@ -4174,8 +4108,6 @@ define amdgpu_kernel void @v_test_imin_sle_v2i16(ptr addrspace(1) %out, ptr addr
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_min_i16 v1, v1, v2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %a.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %a.ptr, i32 %tid
@@ -4325,8 +4257,6 @@ define amdgpu_kernel void @v_test_imin_ule_v2i16(ptr addrspace(1) %out, ptr addr
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_min_u16 v1, v1, v2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %a.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %a.ptr, i32 %tid
diff --git a/llvm/test/CodeGen/AMDGPU/minimummaximum.ll b/llvm/test/CodeGen/AMDGPU/minimummaximum.ll
index 062955d722068e..c375b16ee3809c 100644
--- a/llvm/test/CodeGen/AMDGPU/minimummaximum.ll
+++ b/llvm/test/CodeGen/AMDGPU/minimummaximum.ll
@@ -22,8 +22,6 @@ define amdgpu_ps void @s_test_minmax_f32(float inreg %a, float inreg %b, float i
 ; SDAG-NEXT:    s_minimum_f32 s0, s0, s2
 ; SDAG-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
 ; SDAG-NEXT:    global_store_b32 v0, v1, s[4:5]
-; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: s_test_minmax_f32:
@@ -36,8 +34,6 @@ define amdgpu_ps void @s_test_minmax_f32(float inreg %a, float inreg %b, float i
 ; GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_3)
 ; GISEL-NEXT:    v_mov_b32_e32 v0, s0
 ; GISEL-NEXT:    global_store_b32 v1, v0, s[6:7]
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT:    s_endpgm
   %smax = call float @llvm.maximum.f32(float %a, float %b)
   %sminmax = call float @llvm.minimum.f32(float %smax, float %c)
@@ -115,8 +111,6 @@ define amdgpu_ps void @s_test_minmax_f16(half inreg %a, half inreg %b, half inre
 ; SDAG-NEXT:    s_minimum_f16 s0, s0, s2
 ; SDAG-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
 ; SDAG-NEXT:    global_store_b16 v0, v1, s[4:5]
-; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: s_test_minmax_f16:
@@ -129,8 +123,6 @@ define amdgpu_ps void @s_test_minmax_f16(half inreg %a, half inreg %b, half inre
 ; GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_3)
 ; GISEL-NEXT:    v_mov_b32_e32 v0, s0
 ; GISEL-NEXT:    global_store_b16 v1, v0, s[6:7]
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT:    s_endpgm
   %smax = call half @llvm.maximum.f16(half %a, half %b)
   %sminmax = call half @llvm.minimum.f16(half %smax, half %c)
diff --git a/llvm/test/CodeGen/AMDGPU/minmax.ll b/llvm/test/CodeGen/AMDGPU/minmax.ll
index b5b9997f297c20..73f3d4c037ad51 100644
--- a/llvm/test/CodeGen/AMDGPU/minmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/minmax.ll
@@ -22,8 +22,6 @@ define amdgpu_ps void @s_test_minmax_i32(i32 inreg %a, i32 inreg %b, i32 inreg %
 ; SDAG-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
 ; SDAG-NEXT:    s_mov_b32 s4, s3
 ; SDAG-NEXT:    global_store_b32 v0, v1, s[4:5]
-; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: s_test_minmax_i32:
@@ -34,8 +32,6 @@ define amdgpu_ps void @s_test_minmax_i32(i32 inreg %a, i32 inreg %b, i32 inreg %
 ; GISEL-NEXT:    s_mov_b32 s7, s4
 ; GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0
 ; GISEL-NEXT:    global_store_b32 v1, v0, s[6:7]
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT:    s_endpgm
   %smax = call i32 @llvm.smax.i32(i32 %a, i32 %b)
   %sminmax = call i32 @llvm.smin.i32(i32 %smax, i32 %c)
@@ -111,8 +107,6 @@ define amdgpu_ps void @s_test_minmax_u32(i32 inreg %a, i32 inreg %b, i32 inreg %
 ; SDAG-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
 ; SDAG-NEXT:    s_mov_b32 s4, s3
 ; SDAG-NEXT:    global_store_b32 v0, v1, s[4:5]
-; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: s_test_minmax_u32:
@@ -123,8 +117,6 @@ define amdgpu_ps void @s_test_minmax_u32(i32 inreg %a, i32 inreg %b, i32 inreg %
 ; GISEL-NEXT:    s_mov_b32 s7, s4
 ; GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0
 ; GISEL-NEXT:    global_store_b32 v1, v0, s[6:7]
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT:    s_endpgm
   %smax = call i32 @llvm.umax.i32(i32 %a, i32 %b)
   %sminmax = call i32 @llvm.umin.i32(i32 %smax, i32 %c)
@@ -209,8 +201,6 @@ define amdgpu_ps void @s_test_minmax_f32_ieee_false(float inreg %a, float inreg
 ; SDAG-NEXT:    s_mov_b32 s4, s3
 ; SDAG-NEXT:    v_maxmin_f32 v0, s0, s1, v0
 ; SDAG-NEXT:    global_store_b32 v1, v0, s[4:5]
-; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: s_test_minmax_f32_ieee_false:
@@ -220,8 +210,6 @@ define amdgpu_ps void @s_test_minmax_f32_ieee_false(float inreg %a, float inreg
 ; GISEL-NEXT:    s_mov_b32 s7, s4
 ; GISEL-NEXT:    v_maxmin_f32 v0, s0, s1, v0
 ; GISEL-NEXT:    global_store_b32 v1, v0, s[6:7]
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT:    s_endpgm
   %smax = call float @llvm.maxnum.f32(float %a, float %b)
   %sminmax = call float @llvm.minnum.f32(float %smax, float %c)
@@ -303,8 +291,6 @@ define amdgpu_ps void @s_test_minmax_f16_ieee_false(half inreg %a, half inreg %b
 ; SDAG-NEXT:    s_mov_b32 s4, s3
 ; SDAG-NEXT:    v_maxmin_f16 v0, s0, s1, v0
 ; SDAG-NEXT:    global_store_b16 v1, v0, s[4:5]
-; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: s_test_minmax_f16_ieee_false:
@@ -314,8 +300,6 @@ define amdgpu_ps void @s_test_minmax_f16_ieee_false(half inreg %a, half inreg %b
 ; GISEL-NEXT:    s_mov_b32 s7, s4
 ; GISEL-NEXT:    v_maxmin_f16 v0, s0, s1, v0
 ; GISEL-NEXT:    global_store_b16 v1, v0, s[6:7]
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT:    s_endpgm
   %smax = call half @llvm.maxnum.f16(half %a, half %b)
   %sminmax = call half @llvm.minnum.f16(half %smax, half %c)
diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll
index 30a40e6af85389..03de142a41b4c7 100644
--- a/llvm/test/CodeGen/AMDGPU/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul.ll
@@ -103,8 +103,6 @@ define amdgpu_kernel void @test_mul_v2i32(ptr addrspace(1) %out, ptr addrspace(1
 ; GFX11-NEXT:    v_mul_lo_u32 v1, v1, v3
 ; GFX11-NEXT:    v_mul_lo_u32 v0, v0, v2
 ; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: test_mul_v2i32:
@@ -124,8 +122,6 @@ define amdgpu_kernel void @test_mul_v2i32(ptr addrspace(1) %out, ptr addrspace(1
 ; GFX12-NEXT:    v_mul_lo_u32 v1, v1, v3
 ; GFX12-NEXT:    v_mul_lo_u32 v0, v0, v2
 ; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], null
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 ;
 ; EG-LABEL: test_mul_v2i32:
@@ -265,8 +261,6 @@ define amdgpu_kernel void @v_mul_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX11-NEXT:    v_mul_lo_u32 v1, v1, v5
 ; GFX11-NEXT:    v_mul_lo_u32 v0, v0, v4
 ; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[4:7], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_mul_v4i32:
@@ -290,8 +284,6 @@ define amdgpu_kernel void @v_mul_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX12-NEXT:    v_mul_lo_u32 v1, v1, v5
 ; GFX12-NEXT:    v_mul_lo_u32 v0, v0, v4
 ; GFX12-NEXT:    buffer_store_b128 v[0:3], off, s[4:7], null
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 ;
 ; EG-LABEL: v_mul_v4i32:
@@ -394,8 +386,6 @@ define amdgpu_kernel void @s_trunc_i64_mul_to_i32(ptr addrspace(1) %out, i64 %a,
 ; GFX11-NEXT:    s_mov_b32 s6, -1
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: s_trunc_i64_mul_to_i32:
@@ -409,8 +399,6 @@ define amdgpu_kernel void @s_trunc_i64_mul_to_i32(ptr addrspace(1) %out, i64 %a,
 ; GFX12-NEXT:    s_mov_b32 s6, -1
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-NEXT:    buffer_store_b32 v0, off, s[4:7], null
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_trunc_i64_mul_to_i32:
@@ -541,8 +529,6 @@ define amdgpu_kernel void @v_trunc_i64_mul_to_i32(ptr addrspace(1) %out, ptr add
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mul_lo_u32 v0, v1, v0
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_trunc_i64_mul_to_i32:
@@ -566,8 +552,6 @@ define amdgpu_kernel void @v_trunc_i64_mul_to_i32(ptr addrspace(1) %out, ptr add
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mul_lo_u32 v0, v1, v0
 ; GFX12-NEXT:    buffer_store_b32 v0, off, s[8:11], null
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 ;
 ; EG-LABEL: v_trunc_i64_mul_to_i32:
@@ -670,8 +654,6 @@ define amdgpu_kernel void @mul64_sext_c(ptr addrspace(1) %out, i32 %in) {
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: mul64_sext_c:
@@ -685,8 +667,6 @@ define amdgpu_kernel void @mul64_sext_c(ptr addrspace(1) %out, i32 %in) {
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    s_mov_b32 s2, -1
 ; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], null
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 ;
 ; EG-LABEL: mul64_sext_c:
@@ -778,8 +758,6 @@ define amdgpu_kernel void @mul64_zext_c(ptr addrspace(1) %out, i32 %in) {
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: mul64_zext_c:
@@ -792,8 +770,6 @@ define amdgpu_kernel void @mul64_zext_c(ptr addrspace(1) %out, i32 %in) {
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    s_mov_b32 s2, -1
 ; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], null
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 ;
 ; EG-LABEL: mul64_zext_c:
@@ -911,8 +887,6 @@ define amdgpu_kernel void @v_mul64_sext_c(ptr addrspace(1) %out, ptr addrspace(1
 ; GFX11-NEXT:    v_mul_hi_i32 v1, 0x50, v0
 ; GFX11-NEXT:    v_mul_lo_u32 v0, 0x50, v0
 ; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_mul64_sext_c:
@@ -932,8 +906,6 @@ define amdgpu_kernel void @v_mul64_sext_c(ptr addrspace(1) %out, ptr addrspace(1
 ; GFX12-NEXT:    v_mul_hi_i32 v1, 0x50, v0
 ; GFX12-NEXT:    v_mul_lo_u32 v0, 0x50, v0
 ; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], null
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 ;
 ; EG-LABEL: v_mul64_sext_c:
@@ -1058,8 +1030,6 @@ define amdgpu_kernel void @v_mul64_zext_c(ptr addrspace(1) %out, ptr addrspace(1
 ; GFX11-NEXT:    v_mul_hi_u32 v1, 0x50, v0
 ; GFX11-NEXT:    v_mul_lo_u32 v0, 0x50, v0
 ; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_mul64_zext_c:
@@ -1079,8 +1049,6 @@ define amdgpu_kernel void @v_mul64_zext_c(ptr addrspace(1) %out, ptr addrspace(1
 ; GFX12-NEXT:    v_mul_hi_u32 v1, 0x50, v0
 ; GFX12-NEXT:    v_mul_lo_u32 v0, 0x50, v0
 ; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], null
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 ;
 ; EG-LABEL: v_mul64_zext_c:
@@ -1202,8 +1170,6 @@ define amdgpu_kernel void @v_mul64_sext_inline_imm(ptr addrspace(1) %out, ptr ad
 ; GFX11-NEXT:    v_mul_hi_i32 v1, v0, 9
 ; GFX11-NEXT:    v_mul_lo_u32 v0, v0, 9
 ; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_mul64_sext_inline_imm:
@@ -1223,8 +1189,6 @@ define amdgpu_kernel void @v_mul64_sext_inline_imm(ptr addrspace(1) %out, ptr ad
 ; GFX12-NEXT:    v_mul_hi_i32 v1, 9, v0
 ; GFX12-NEXT:    v_mul_lo_u32 v0, 9, v0
 ; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], null
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 ;
 ; EG-LABEL: v_mul64_sext_inline_imm:
@@ -1320,8 +1284,6 @@ define amdgpu_kernel void @s_mul_i32(ptr addrspace(1) %out, [8 x i32], i32 %a, [
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: s_mul_i32:
@@ -1337,8 +1299,6 @@ define amdgpu_kernel void @s_mul_i32(ptr addrspace(1) %out, [8 x i32], i32 %a, [
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-NEXT:    s_mov_b32 s2, -1
 ; GFX12-NEXT:    buffer_store_b32 v0, off, s[0:3], null
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_mul_i32:
@@ -1446,8 +1406,6 @@ define amdgpu_kernel void @v_mul_i32(ptr addrspace(1) %out, ptr addrspace(1) %in
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mul_lo_u32 v0, v0, v1
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_mul_i32:
@@ -1466,8 +1424,6 @@ define amdgpu_kernel void @v_mul_i32(ptr addrspace(1) %out, ptr addrspace(1) %in
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mul_lo_u32 v0, v0, v1
 ; GFX12-NEXT:    buffer_store_b32 v0, off, s[4:7], null
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 ;
 ; EG-LABEL: v_mul_i32:
@@ -1567,8 +1523,6 @@ define amdgpu_kernel void @s_mul_i1(ptr addrspace(1) %out, [8 x i32], i1 %a, [8
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    buffer_store_b8 v0, off, s[0:3], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: s_mul_i1:
@@ -1585,8 +1539,6 @@ define amdgpu_kernel void @s_mul_i1(ptr addrspace(1) %out, [8 x i32], i1 %a, [8
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-NEXT:    s_mov_b32 s2, -1
 ; GFX12-NEXT:    buffer_store_b8 v0, off, s[0:3], null
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_mul_i1:
@@ -1724,8 +1676,6 @@ define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in)
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX11-NEXT:    buffer_store_b8 v0, off, s[4:7], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_mul_i1:
@@ -1748,8 +1698,6 @@ define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in)
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX12-NEXT:    buffer_store_b8 v0, off, s[4:7], null
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 ;
 ; EG-LABEL: v_mul_i1:
@@ -1894,8 +1842,6 @@ define amdgpu_kernel void @s_mul_i64(ptr addrspace(1) %out, i64 %a, i64 %b) noun
 ; GFX11-NEXT:    s_mov_b32 s0, s4
 ; GFX11-NEXT:    s_mov_b32 s1, s5
 ; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: s_mul_i64:
@@ -1909,8 +1855,6 @@ define amdgpu_kernel void @s_mul_i64(ptr addrspace(1) %out, i64 %a, i64 %b) noun
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    s_mov_b32 s6, -1
 ; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], null
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_mul_i64:
@@ -2070,8 +2014,6 @@ define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %ap
 ; GFX11-NEXT:    v_add_nc_u32_e32 v1, v4, v1
 ; GFX11-NEXT:    v_add_nc_u32_e32 v1, v1, v3
 ; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_mul_i64:
@@ -2101,8 +2043,6 @@ define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %ap
 ; GFX12-NEXT:    v_add_nc_u32_e32 v1, v3, v1
 ; GFX12-NEXT:    v_add_nc_u32_e32 v1, v1, v4
 ; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], null
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 ;
 ; EG-LABEL: v_mul_i64:
@@ -2310,8 +2250,6 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace(
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: mul32_in_branch:
@@ -2347,8 +2285,6 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace(
 ; GFX12-NEXT:    s_mov_b32 s2, -1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    buffer_store_b32 v0, off, s[0:3], null
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 ;
 ; EG-LABEL: mul32_in_branch:
@@ -2574,8 +2510,6 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace(
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: mul64_in_branch:
@@ -2604,8 +2538,6 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace(
 ; GFX12-NEXT:    s_mov_b32 s2, -1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], null
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 ;
 ; EG-LABEL: mul64_in_branch:
@@ -2909,8 +2841,6 @@ define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a,
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: s_mul_i128:
@@ -2954,8 +2884,6 @@ define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a,
 ; GFX12-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX12-NEXT:    s_mov_b32 s2, -1
 ; GFX12-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], null
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_mul_i128:
@@ -3202,8 +3130,6 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a
 ; GFX11-NEXT:    v_add_co_u32 v10, vcc_lo, v6, v13
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v11, vcc_lo, v7, v0, vcc_lo
 ; GFX11-NEXT:    global_store_b128 v15, v[8:11], s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_mul_i128:
@@ -3243,8 +3169,6 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a
 ; GFX12-NEXT:    v_add_co_u32 v10, vcc_lo, v0, v2
 ; GFX12-NEXT:    v_add_co_ci_u32_e32 v11, vcc_lo, v1, v3, vcc_lo
 ; GFX12-NEXT:    global_store_b128 v13, v[8:11], s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 ;
 ; EG-LABEL: v_mul_i128:
diff --git a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll
index 548c196df1da54..3a9cf9678d8461 100644
--- a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll
+++ b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll
@@ -1884,8 +1884,6 @@ define amdgpu_kernel void @global_inst_salu_offset_1(ptr addrspace(1) %p) {
 ; GFX11-NEXT:    global_load_u8 v0, v0, s[0:1] offset:1 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_inst_salu_offset_1:
@@ -1896,8 +1894,6 @@ define amdgpu_kernel void @global_inst_salu_offset_1(ptr addrspace(1) %p) {
 ; GFX12-NEXT:    global_load_u8 v0, v0, s[0:1] offset:1 scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 1
   %load = load volatile i8, ptr addrspace(1) %gep, align 1
@@ -1934,8 +1930,6 @@ define amdgpu_kernel void @global_inst_salu_offset_11bit_max(ptr addrspace(1) %p
 ; GFX11-NEXT:    global_load_u8 v0, v0, s[0:1] offset:2047 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_inst_salu_offset_11bit_max:
@@ -1946,8 +1940,6 @@ define amdgpu_kernel void @global_inst_salu_offset_11bit_max(ptr addrspace(1) %p
 ; GFX12-NEXT:    global_load_u8 v0, v0, s[0:1] offset:2047 scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 2047
   %load = load volatile i8, ptr addrspace(1) %gep, align 1
@@ -1984,8 +1976,6 @@ define amdgpu_kernel void @global_inst_salu_offset_12bit_max(ptr addrspace(1) %p
 ; GFX11-NEXT:    global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_inst_salu_offset_12bit_max:
@@ -1996,8 +1986,6 @@ define amdgpu_kernel void @global_inst_salu_offset_12bit_max(ptr addrspace(1) %p
 ; GFX12-NEXT:    global_load_u8 v0, v0, s[0:1] offset:4095 scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 4095
   %load = load volatile i8, ptr addrspace(1) %gep, align 1
@@ -2034,8 +2022,6 @@ define amdgpu_kernel void @global_inst_salu_offset_13bit_max(ptr addrspace(1) %p
 ; GFX11-NEXT:    global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_inst_salu_offset_13bit_max:
@@ -2046,8 +2032,6 @@ define amdgpu_kernel void @global_inst_salu_offset_13bit_max(ptr addrspace(1) %p
 ; GFX12-NEXT:    global_load_u8 v0, v0, s[0:1] offset:8191 scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 8191
   %load = load volatile i8, ptr addrspace(1) %gep, align 1
@@ -2084,8 +2068,6 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_11bit_max(ptr addrspace(1
 ; GFX11-NEXT:    global_load_u8 v0, v0, s[0:1] offset:-2048 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_inst_salu_offset_neg_11bit_max:
@@ -2096,8 +2078,6 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_11bit_max(ptr addrspace(1
 ; GFX12-NEXT:    global_load_u8 v0, v0, s[0:1] offset:-2048 scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -2048
   %load = load volatile i8, ptr addrspace(1) %gep, align 1
@@ -2137,8 +2117,6 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_12bit_max(ptr addrspace(1
 ; GFX11-NEXT:    global_load_u8 v0, v0, s[0:1] offset:-4096 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_inst_salu_offset_neg_12bit_max:
@@ -2149,8 +2127,6 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_12bit_max(ptr addrspace(1
 ; GFX12-NEXT:    global_load_u8 v0, v0, s[0:1] offset:-4096 scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 ;
 ; GFX10-SDAG-LABEL: global_inst_salu_offset_neg_12bit_max:
@@ -2206,8 +2182,6 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_13bit_max(ptr addrspace(1
 ; GFX11-GISEL-NEXT:    global_load_u8 v0, v[0:1], off glc dlc
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_inst_salu_offset_neg_13bit_max:
@@ -2218,8 +2192,6 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_13bit_max(ptr addrspace(1
 ; GFX12-NEXT:    global_load_u8 v0, v0, s[0:1] offset:-8192 scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 ;
 ; GFX10-SDAG-LABEL: global_inst_salu_offset_neg_13bit_max:
@@ -2243,8 +2215,6 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_13bit_max(ptr addrspace(1
 ; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off glc dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-SDAG-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -8192
   %load = load volatile i8, ptr addrspace(1) %gep, align 1
@@ -2281,8 +2251,6 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_11bit_max(ptr addrspace(1)
 ; GFX11-NEXT:    global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_inst_salu_offset_2x_11bit_max:
@@ -2293,8 +2261,6 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_11bit_max(ptr addrspace(1)
 ; GFX12-NEXT:    global_load_u8 v0, v0, s[0:1] offset:4095 scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 4095
   %load = load volatile i8, ptr addrspace(1) %gep, align 1
@@ -2331,8 +2297,6 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_12bit_max(ptr addrspace(1)
 ; GFX11-NEXT:    global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_inst_salu_offset_2x_12bit_max:
@@ -2343,8 +2307,6 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_12bit_max(ptr addrspace(1)
 ; GFX12-NEXT:    global_load_u8 v0, v0, s[0:1] offset:8191 scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 8191
   %load = load volatile i8, ptr addrspace(1) %gep, align 1
@@ -2381,8 +2343,6 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_13bit_max(ptr addrspace(1)
 ; GFX11-NEXT:    global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_inst_salu_offset_2x_13bit_max:
@@ -2393,8 +2353,6 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_13bit_max(ptr addrspace(1)
 ; GFX12-NEXT:    global_load_u8 v0, v0, s[0:1] offset:16383 scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 16383
   %load = load volatile i8, ptr addrspace(1) %gep, align 1
@@ -2434,8 +2392,6 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_11bit_max(ptr addrspac
 ; GFX11-NEXT:    global_load_u8 v0, v0, s[0:1] offset:-4096 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_inst_salu_offset_2x_neg_11bit_max:
@@ -2446,8 +2402,6 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_11bit_max(ptr addrspac
 ; GFX12-NEXT:    global_load_u8 v0, v0, s[0:1] offset:-4096 scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 ;
 ; GFX10-SDAG-LABEL: global_inst_salu_offset_2x_neg_11bit_max:
@@ -2503,8 +2457,6 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_12bit_max(ptr addrspac
 ; GFX11-GISEL-NEXT:    global_load_u8 v0, v[0:1], off glc dlc
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_inst_salu_offset_2x_neg_12bit_max:
@@ -2515,8 +2467,6 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_12bit_max(ptr addrspac
 ; GFX12-NEXT:    global_load_u8 v0, v0, s[0:1] offset:-8192 scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 ;
 ; GFX10-SDAG-LABEL: global_inst_salu_offset_2x_neg_12bit_max:
@@ -2540,8 +2490,6 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_12bit_max(ptr addrspac
 ; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off glc dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-SDAG-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -8192
   %load = load volatile i8, ptr addrspace(1) %gep, align 1
@@ -2586,8 +2534,6 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_13bit_max(ptr addrspac
 ; GFX11-GISEL-NEXT:    global_load_u8 v0, v[0:1], off glc dlc
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_inst_salu_offset_2x_neg_13bit_max:
@@ -2598,8 +2544,6 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_13bit_max(ptr addrspac
 ; GFX12-NEXT:    global_load_u8 v0, v0, s[0:1] offset:-16384 scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 ;
 ; GFX10-SDAG-LABEL: global_inst_salu_offset_2x_neg_13bit_max:
@@ -2623,8 +2567,6 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_13bit_max(ptr addrspac
 ; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off glc dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-SDAG-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -16384
   %load = load volatile i8, ptr addrspace(1) %gep, align 1
@@ -2670,8 +2612,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(ptr addrsp
 ; GFX11-GISEL-NEXT:    global_load_u8 v0, v[0:1], off glc dlc
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_split0:
@@ -2685,8 +2625,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(ptr addrsp
 ; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off scope:SCOPE_SYS
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
 ;
 ; GFX10-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_split0:
@@ -2710,8 +2648,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(ptr addrsp
 ; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:2047 glc dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-SDAG-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_split0:
@@ -2724,8 +2660,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(ptr addrsp
 ; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:2047 scope:SCOPE_SYS
 ; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589936639
   %load = load volatile i8, ptr addrspace(1) %gep, align 1
@@ -2771,8 +2705,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(ptr addrsp
 ; GFX11-GISEL-NEXT:    global_load_u8 v0, v[0:1], off glc dlc
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_split1:
@@ -2786,8 +2718,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(ptr addrsp
 ; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off scope:SCOPE_SYS
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
 ;
 ; GFX10-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_split1:
@@ -2811,8 +2741,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(ptr addrsp
 ; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:2048 glc dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-SDAG-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_split1:
@@ -2825,8 +2753,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(ptr addrsp
 ; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:2048 scope:SCOPE_SYS
 ; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589936640
   %load = load volatile i8, ptr addrspace(1) %gep, align 1
@@ -2872,8 +2798,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(ptr addrsp
 ; GFX11-GISEL-NEXT:    global_load_u8 v0, v[0:1], off glc dlc
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_split0:
@@ -2887,8 +2811,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(ptr addrsp
 ; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off scope:SCOPE_SYS
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
 ;
 ; GFX10-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_split0:
@@ -2912,8 +2834,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(ptr addrsp
 ; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:4095 glc dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-SDAG-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_split0:
@@ -2926,8 +2846,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(ptr addrsp
 ; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:4095 scope:SCOPE_SYS
 ; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589938687
   %load = load volatile i8, ptr addrspace(1) %gep, align 1
@@ -2973,8 +2891,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(ptr addrsp
 ; GFX11-GISEL-NEXT:    global_load_u8 v0, v[0:1], off glc dlc
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_split1:
@@ -2988,8 +2904,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(ptr addrsp
 ; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off scope:SCOPE_SYS
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
 ;
 ; GFX10-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_split1:
@@ -3013,8 +2927,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(ptr addrsp
 ; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off glc dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-SDAG-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_split1:
@@ -3027,8 +2939,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(ptr addrsp
 ; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:4096 scope:SCOPE_SYS
 ; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589938688
   %load = load volatile i8, ptr addrspace(1) %gep, align 1
@@ -3074,8 +2984,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(ptr addrsp
 ; GFX11-GISEL-NEXT:    global_load_u8 v0, v[0:1], off glc dlc
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_split0:
@@ -3089,8 +2997,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(ptr addrsp
 ; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off scope:SCOPE_SYS
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
 ;
 ; GFX10-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_split0:
@@ -3114,8 +3020,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(ptr addrsp
 ; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:4095 glc dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-SDAG-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_split0:
@@ -3128,8 +3032,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(ptr addrsp
 ; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:8191 scope:SCOPE_SYS
 ; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589942783
   %load = load volatile i8, ptr addrspace(1) %gep, align 1
@@ -3175,8 +3077,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(ptr addrsp
 ; GFX11-GISEL-NEXT:    global_load_u8 v0, v[0:1], off glc dlc
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_split1:
@@ -3190,8 +3090,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(ptr addrsp
 ; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off scope:SCOPE_SYS
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
 ;
 ; GFX10-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_split1:
@@ -3215,8 +3113,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(ptr addrsp
 ; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off glc dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-SDAG-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_split1:
@@ -3229,8 +3125,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(ptr addrsp
 ; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:8192 scope:SCOPE_SYS
 ; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589942784
   %load = load volatile i8, ptr addrspace(1) %gep, align 1
@@ -3274,8 +3168,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split0(p
 ; GFX11-NEXT:    global_load_u8 v0, v0, s[0:1] glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0:
@@ -3288,8 +3180,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split0(p
 ; GFX12-GISEL-NEXT:    global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0:
@@ -3303,8 +3193,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split0(p
 ; GFX12-SDAG-NEXT:    global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
 ; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854773761
   %load = load volatile i8, ptr addrspace(1) %gep, align 1
@@ -3348,8 +3236,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split1(p
 ; GFX11-NEXT:    global_load_u8 v0, v0, s[0:1] glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1:
@@ -3362,8 +3248,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split1(p
 ; GFX12-GISEL-NEXT:    global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1:
@@ -3377,8 +3261,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split1(p
 ; GFX12-SDAG-NEXT:    global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
 ; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854773760
   %load = load volatile i8, ptr addrspace(1) %gep, align 1
@@ -3422,8 +3304,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split0(p
 ; GFX11-NEXT:    global_load_u8 v0, v0, s[0:1] glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0:
@@ -3436,8 +3316,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split0(p
 ; GFX12-GISEL-NEXT:    global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0:
@@ -3451,8 +3329,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split0(p
 ; GFX12-SDAG-NEXT:    global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
 ; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854771713
   %load = load volatile i8, ptr addrspace(1) %gep, align 1
@@ -3496,8 +3372,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split1(p
 ; GFX11-NEXT:    global_load_u8 v0, v0, s[0:1] glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1:
@@ -3510,8 +3384,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split1(p
 ; GFX12-GISEL-NEXT:    global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1:
@@ -3525,8 +3397,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split1(p
 ; GFX12-SDAG-NEXT:    global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
 ; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854771712
   %load = load volatile i8, ptr addrspace(1) %gep, align 1
@@ -3570,8 +3440,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split0(p
 ; GFX11-NEXT:    global_load_u8 v0, v0, s[0:1] glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0:
@@ -3584,8 +3452,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split0(p
 ; GFX12-GISEL-NEXT:    global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0:
@@ -3599,8 +3465,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split0(p
 ; GFX12-SDAG-NEXT:    global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
 ; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854767617
   %load = load volatile i8, ptr addrspace(1) %gep, align 1
@@ -3644,8 +3508,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split1(p
 ; GFX11-NEXT:    global_load_u8 v0, v0, s[0:1] glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1:
@@ -3658,8 +3520,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split1(p
 ; GFX12-GISEL-NEXT:    global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX12-GISEL-NEXT:    s_nop 0
-; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1:
@@ -3673,8 +3533,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split1(p
 ; GFX12-SDAG-NEXT:    global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
 ; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX12-SDAG-NEXT:    s_nop 0
-; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-SDAG-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854767616
   %load = load volatile i8, ptr addrspace(1) %gep, align 1
diff --git a/llvm/test/CodeGen/AMDGPU/omod.ll b/llvm/test/CodeGen/AMDGPU/omod.ll
index df15f98ae27ff6..dd0b96fbb49591 100644
--- a/llvm/test/CodeGen/AMDGPU/omod.ll
+++ b/llvm/test/CodeGen/AMDGPU/omod.ll
@@ -53,8 +53,6 @@ define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_signed_zeros(ptr addrspac
 ; GFX11-NEXT:    v_add_f32_e32 v1, 1.0, v1
 ; GFX11-NEXT:    v_mul_f32_e32 v1, 0.5, v1
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_omod_div2_f32_enable_ieee_signed_zeros:
@@ -69,8 +67,6 @@ define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_signed_zeros(ptr addrspac
 ; GFX12-NEXT:    v_add_f32_e32 v1, 1.0, v1
 ; GFX12-NEXT:    v_mul_f32_e32 v1, 0.5, v1
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -131,8 +127,6 @@ define amdgpu_kernel void @v_omod_div2_f64_enable_ieee_signed_zeros(ptr addrspac
 ; GFX11-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
 ; GFX11-NEXT:    v_mul_f64 v[0:1], v[0:1], 0.5
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_omod_div2_f64_enable_ieee_signed_zeros:
@@ -147,8 +141,6 @@ define amdgpu_kernel void @v_omod_div2_f64_enable_ieee_signed_zeros(ptr addrspac
 ; GFX12-NEXT:    v_add_f64_e32 v[0:1], 1.0, v[0:1]
 ; GFX12-NEXT:    v_mul_f64_e32 v[0:1], 0.5, v[0:1]
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr double, ptr addrspace(1) %aptr, i32 %tid
@@ -209,8 +201,6 @@ define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_nsz(ptr addrspace(1) %out
 ; GFX11-NEXT:    v_add_f32_e32 v1, 1.0, v1
 ; GFX11-NEXT:    v_mul_f32_e32 v1, 0.5, v1
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_omod_div2_f32_enable_ieee_nsz:
@@ -225,8 +215,6 @@ define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_nsz(ptr addrspace(1) %out
 ; GFX12-NEXT:    v_add_f32_e32 v1, 1.0, v1
 ; GFX12-NEXT:    v_mul_f32_e32 v1, 0.5, v1
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
@@ -287,8 +275,6 @@ define amdgpu_kernel void @v_omod_div2_f64_enable_ieee_nsz(ptr addrspace(1) %out
 ; GFX11-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
 ; GFX11-NEXT:    v_mul_f64 v[0:1], v[0:1], 0.5
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_omod_div2_f64_enable_ieee_nsz:
@@ -303,8 +289,6 @@ define amdgpu_kernel void @v_omod_div2_f64_enable_ieee_nsz(ptr addrspace(1) %out
 ; GFX12-NEXT:    v_add_f64_e32 v[0:1], 1.0, v[0:1]
 ; GFX12-NEXT:    v_mul_f64_e32 v[0:1], 0.5, v[0:1]
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr double, ptr addrspace(1) %aptr, i32 %tid
@@ -340,8 +324,6 @@ define amdgpu_ps void @v_omod_div2_f32_signed_zeros(float %a) #4 {
 ; GFX11PLUS-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11PLUS-NEXT:    v_mul_f32_e32 v0, 0.5, v0
 ; GFX11PLUS-NEXT:    global_store_b32 v[0:1], v0, off
-; GFX11PLUS-NEXT:    s_nop 0
-; GFX11PLUS-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11PLUS-NEXT:    s_endpgm
   %add = fadd float %a, 1.0
   %div2 = fmul float %add, 0.5
@@ -373,8 +355,6 @@ define amdgpu_ps void @v_omod_div2_f64_signed_zeros(double %a) #4 {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_mul_f64 v[0:1], v[0:1], 0.5
 ; GFX11-NEXT:    global_store_b64 v[0:1], v[0:1], off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_omod_div2_f64_signed_zeros:
@@ -383,8 +363,6 @@ define amdgpu_ps void @v_omod_div2_f64_signed_zeros(double %a) #4 {
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_mul_f64_e32 v[0:1], 0.5, v[0:1]
 ; GFX12-NEXT:    global_store_b64 v[0:1], v[0:1], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %add = fadd double %a, 1.0
   %div2 = fmul double %add, 0.5
@@ -411,8 +389,6 @@ define amdgpu_ps void @v_omod_div2_f32(float %a) #0 {
 ; GFX11PLUS:       ; %bb.0:
 ; GFX11PLUS-NEXT:    v_add_f32_e64 v0, v0, 1.0 div:2
 ; GFX11PLUS-NEXT:    global_store_b32 v[0:1], v0, off
-; GFX11PLUS-NEXT:    s_nop 0
-; GFX11PLUS-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11PLUS-NEXT:    s_endpgm
   %add = fadd float %a, 1.0
   %div2 = fmul float %add, 0.5
@@ -439,16 +415,12 @@ define amdgpu_ps void @v_omod_div2_f64(double %a) #5 {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0 div:2
 ; GFX11-NEXT:    global_store_b64 v[0:1], v[0:1], off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_omod_div2_f64:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    v_add_f64_e64 v[0:1], v[0:1], 1.0 div:2
 ; GFX12-NEXT:    global_store_b64 v[0:1], v[0:1], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %add = fadd nsz double %a, 1.0
   %div2 = fmul nsz double %add, 0.5
@@ -475,8 +447,6 @@ define amdgpu_ps void @v_omod_mul2_f32(float %a) #0 {
 ; GFX11PLUS:       ; %bb.0:
 ; GFX11PLUS-NEXT:    v_add_f32_e64 v0, v0, 1.0 mul:2
 ; GFX11PLUS-NEXT:    global_store_b32 v[0:1], v0, off
-; GFX11PLUS-NEXT:    s_nop 0
-; GFX11PLUS-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11PLUS-NEXT:    s_endpgm
   %add = fadd float %a, 1.0
   %div2 = fmul float %add, 2.0
@@ -503,16 +473,12 @@ define amdgpu_ps void @v_omod_mul2_med3(float %x, float %y, float %z) #0 {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_med3_f32 v0, v0, v1, v2 mul:2
 ; GFX11-NEXT:    global_store_b32 v[0:1], v0, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_omod_mul2_med3:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    v_med3_num_f32 v0, v0, v1, v2 mul:2
 ; GFX12-NEXT:    global_store_b32 v[0:1], v0, off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %fmed3 = call float @llvm.amdgcn.fmed3.f32(float %x, float %y, float %z)
   %div2 = fmul float %fmed3, 2.0
@@ -539,16 +505,12 @@ define amdgpu_ps void @v_omod_mul2_f64(double %a) #5 {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0 mul:2
 ; GFX11-NEXT:    global_store_b64 v[0:1], v[0:1], off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_omod_mul2_f64:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    v_add_f64_e64 v[0:1], v[0:1], 1.0 mul:2
 ; GFX12-NEXT:    global_store_b64 v[0:1], v[0:1], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %add = fadd nsz double %a, 1.0
   %div2 = fmul nsz double %add, 2.0
@@ -575,8 +537,6 @@ define amdgpu_ps void @v_omod_mul4_f32(float %a) #0 {
 ; GFX11PLUS:       ; %bb.0:
 ; GFX11PLUS-NEXT:    v_add_f32_e64 v0, v0, 1.0 mul:4
 ; GFX11PLUS-NEXT:    global_store_b32 v[0:1], v0, off
-; GFX11PLUS-NEXT:    s_nop 0
-; GFX11PLUS-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11PLUS-NEXT:    s_endpgm
   %add = fadd float %a, 1.0
   %div2 = fmul float %add, 4.0
@@ -603,16 +563,12 @@ define amdgpu_ps void @v_omod_mul4_f64(double %a) #5 {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0 mul:4
 ; GFX11-NEXT:    global_store_b64 v[0:1], v[0:1], off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_omod_mul4_f64:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    v_add_f64_e64 v[0:1], v[0:1], 1.0 mul:4
 ; GFX12-NEXT:    global_store_b64 v[0:1], v[0:1], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %add = fadd nsz double %a, 1.0
   %div2 = fmul nsz double %add, 4.0
@@ -650,8 +606,6 @@ define amdgpu_ps void @v_omod_mul4_multi_use_f32(float %a) #0 {
 ; GFX11-NEXT:    global_store_b32 v[0:1], v1, off
 ; GFX11-NEXT:    global_store_b32 v[0:1], v0, off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_omod_mul4_multi_use_f32:
@@ -663,8 +617,6 @@ define amdgpu_ps void @v_omod_mul4_multi_use_f32(float %a) #0 {
 ; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %add = fadd float %a, 1.0
   %div2 = fmul float %add, 4.0
@@ -692,8 +644,6 @@ define amdgpu_ps void @v_omod_mul4_dbg_use_f32(float %a) #0 {
 ; GFX11PLUS:       ; %bb.0:
 ; GFX11PLUS-NEXT:    v_add_f32_e64 v0, v0, 1.0 mul:4
 ; GFX11PLUS-NEXT:    global_store_b32 v[0:1], v0, off
-; GFX11PLUS-NEXT:    s_nop 0
-; GFX11PLUS-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11PLUS-NEXT:    s_endpgm
   %add = fadd float %a, 1.0
   call void @llvm.dbg.value(metadata float %add, i64 0, metadata !4, metadata !9), !dbg !10
@@ -722,8 +672,6 @@ define amdgpu_ps void @v_clamp_omod_div2_f32(float %a) #0 {
 ; GFX11PLUS:       ; %bb.0:
 ; GFX11PLUS-NEXT:    v_add_f32_e64 v0, v0, 1.0 clamp div:2
 ; GFX11PLUS-NEXT:    global_store_b32 v[0:1], v0, off
-; GFX11PLUS-NEXT:    s_nop 0
-; GFX11PLUS-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11PLUS-NEXT:    s_endpgm
   %add = fadd float %a, 1.0
   %div2 = fmul float %add, 0.5
@@ -758,8 +706,6 @@ define amdgpu_ps void @v_omod_div2_clamp_f32(float %a) #0 {
 ; GFX11PLUS-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11PLUS-NEXT:    v_mul_f32_e32 v0, 0.5, v0
 ; GFX11PLUS-NEXT:    global_store_b32 v[0:1], v0, off
-; GFX11PLUS-NEXT:    s_nop 0
-; GFX11PLUS-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11PLUS-NEXT:    s_endpgm
   %add = fadd float %a, 1.0
   %max = call float @llvm.maxnum.f32(float %add, float 0.0)
@@ -792,8 +738,6 @@ define amdgpu_ps void @v_omod_div2_abs_src_f32(float %a) #0 {
 ; GFX11PLUS-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11PLUS-NEXT:    v_mul_f32_e64 v0, |v0|, 0.5
 ; GFX11PLUS-NEXT:    global_store_b32 v[0:1], v0, off
-; GFX11PLUS-NEXT:    s_nop 0
-; GFX11PLUS-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11PLUS-NEXT:    s_endpgm
   %add = fadd float %a, 1.0
   %abs.add = call float @llvm.fabs.f32(float %add)
@@ -821,8 +765,6 @@ define amdgpu_ps void @v_omod_add_self_clamp_f32(float %a) #0 {
 ; GFX11PLUS:       ; %bb.0:
 ; GFX11PLUS-NEXT:    v_add_f32_e64 v0, v0, v0 clamp
 ; GFX11PLUS-NEXT:    global_store_b32 v[0:1], v0, off
-; GFX11PLUS-NEXT:    s_nop 0
-; GFX11PLUS-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11PLUS-NEXT:    s_endpgm
   %add = fadd float %a, %a
   %max = call float @llvm.maxnum.f32(float %add, float 0.0)
@@ -854,8 +796,6 @@ define amdgpu_ps void @v_omod_add_clamp_self_f32(float %a) #0 {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_add_f32_e32 v0, v0, v0
 ; GFX11-NEXT:    global_store_b32 v[0:1], v0, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_omod_add_clamp_self_f32:
@@ -864,8 +804,6 @@ define amdgpu_ps void @v_omod_add_clamp_self_f32(float %a) #0 {
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_add_f32_e32 v0, v0, v0
 ; GFX12-NEXT:    global_store_b32 v[0:1], v0, off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %max = call float @llvm.maxnum.f32(float %a, float 0.0)
   %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
@@ -897,8 +835,6 @@ define amdgpu_ps void @v_omod_add_abs_self_f32(float %a) #0 {
 ; GFX11PLUS-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11PLUS-NEXT:    v_add_f32_e64 v0, |v0|, |v0|
 ; GFX11PLUS-NEXT:    global_store_b32 v[0:1], v0, off
-; GFX11PLUS-NEXT:    s_nop 0
-; GFX11PLUS-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11PLUS-NEXT:    s_endpgm
   %x = fadd float %a, 1.0
   %abs.x = call float @llvm.fabs.f32(float %x)
@@ -930,8 +866,6 @@ define amdgpu_ps void @v_omod_add_abs_x_x_f32(float %a) #0 {
 ; GFX11PLUS-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11PLUS-NEXT:    v_add_f32_e64 v0, |v0|, v0
 ; GFX11PLUS-NEXT:    global_store_b32 v[0:1], v0, off
-; GFX11PLUS-NEXT:    s_nop 0
-; GFX11PLUS-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11PLUS-NEXT:    s_endpgm
   %x = fadd float %a, 1.0
   %abs.x = call float @llvm.fabs.f32(float %x)
@@ -963,8 +897,6 @@ define amdgpu_ps void @v_omod_add_x_abs_x_f32(float %a) #0 {
 ; GFX11PLUS-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11PLUS-NEXT:    v_add_f32_e64 v0, v0, |v0|
 ; GFX11PLUS-NEXT:    global_store_b32 v[0:1], v0, off
-; GFX11PLUS-NEXT:    s_nop 0
-; GFX11PLUS-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11PLUS-NEXT:    s_endpgm
   %x = fadd float %a, 1.0
   %abs.x = call float @llvm.fabs.f32(float %x)
@@ -997,8 +929,6 @@ define amdgpu_ps void @v_omod_div2_omod_div2_f32(float %a) #0 {
 ; GFX11PLUS-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11PLUS-NEXT:    v_mul_f32_e32 v0, 0.5, v0
 ; GFX11PLUS-NEXT:    global_store_b32 v[0:1], v0, off
-; GFX11PLUS-NEXT:    s_nop 0
-; GFX11PLUS-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11PLUS-NEXT:    s_endpgm
   %add = fadd float %a, 1.0
   %div2.0 = fmul float %add, 0.5
@@ -1031,8 +961,6 @@ define amdgpu_ps void @v_omod_div2_f32_denormals(float %a) #2 {
 ; GFX11PLUS-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11PLUS-NEXT:    v_mul_f32_e32 v0, 0.5, v0
 ; GFX11PLUS-NEXT:    global_store_b32 v[0:1], v0, off
-; GFX11PLUS-NEXT:    s_nop 0
-; GFX11PLUS-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11PLUS-NEXT:    s_endpgm
   %add = fadd float %a, 1.0
   %div2 = fmul float %add, 0.5
@@ -1064,8 +992,6 @@ define amdgpu_ps void @v_omod_div2_f64_denormals(double %a) #6 {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_mul_f64 v[0:1], v[0:1], 0.5
 ; GFX11-NEXT:    global_store_b64 v[0:1], v[0:1], off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_omod_div2_f64_denormals:
@@ -1074,8 +1000,6 @@ define amdgpu_ps void @v_omod_div2_f64_denormals(double %a) #6 {
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_mul_f64_e32 v[0:1], 0.5, v[0:1]
 ; GFX12-NEXT:    global_store_b64 v[0:1], v[0:1], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %add = fadd double %a, 1.0
   %div2 = fmul double %add, 0.5
@@ -1107,8 +1031,6 @@ define amdgpu_ps void @v_omod_mul2_f32_denormals(float %a) #2 {
 ; GFX11PLUS-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11PLUS-NEXT:    v_add_f32_e32 v0, v0, v0
 ; GFX11PLUS-NEXT:    global_store_b32 v[0:1], v0, off
-; GFX11PLUS-NEXT:    s_nop 0
-; GFX11PLUS-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11PLUS-NEXT:    s_endpgm
   %add = fadd float %a, 1.0
   %mul2 = fadd float %add, %add
@@ -1140,8 +1062,6 @@ define amdgpu_ps void @v_omod_mul2_f64_denormals(double %a) #2 {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_add_f64 v[0:1], v[0:1], v[0:1]
 ; GFX11-NEXT:    global_store_b64 v[0:1], v[0:1], off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: v_omod_mul2_f64_denormals:
@@ -1150,8 +1070,6 @@ define amdgpu_ps void @v_omod_mul2_f64_denormals(double %a) #2 {
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_add_f64_e32 v[0:1], v[0:1], v[0:1]
 ; GFX12-NEXT:    global_store_b64 v[0:1], v[0:1], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %add = fadd double %a, 1.0
   %mul2 = fadd double %add, %add
@@ -1185,8 +1103,6 @@ define amdgpu_ps void @v_omod_div2_f16_denormals(half %a) #0 {
 ; GFX11PLUS-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11PLUS-NEXT:    v_mul_f16_e32 v0, 0.5, v0
 ; GFX11PLUS-NEXT:    global_store_b16 v[0:1], v0, off
-; GFX11PLUS-NEXT:    s_nop 0
-; GFX11PLUS-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11PLUS-NEXT:    s_endpgm
   %add = fadd half %a, 1.0
   %div2 = fmul half %add, 0.5
@@ -1220,8 +1136,6 @@ define amdgpu_ps void @v_omod_mul2_f16_denormals(half %a) #0 {
 ; GFX11PLUS-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11PLUS-NEXT:    v_add_f16_e32 v0, v0, v0
 ; GFX11PLUS-NEXT:    global_store_b16 v[0:1], v0, off
-; GFX11PLUS-NEXT:    s_nop 0
-; GFX11PLUS-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11PLUS-NEXT:    s_endpgm
   %add = fadd half %a, 1.0
   %mul2 = fadd half %add, %add
@@ -1251,8 +1165,6 @@ define amdgpu_ps void @v_omod_div2_f16_no_denormals(half %a) #3 {
 ; GFX11PLUS:       ; %bb.0:
 ; GFX11PLUS-NEXT:    v_add_f16_e64 v0, v0, 1.0 div:2
 ; GFX11PLUS-NEXT:    global_store_b16 v[0:1], v0, off
-; GFX11PLUS-NEXT:    s_nop 0
-; GFX11PLUS-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11PLUS-NEXT:    s_endpgm
   %add = fadd half %a, 1.0
   %div2 = fmul half %add, 0.5
@@ -1284,8 +1196,6 @@ define amdgpu_ps void @v_omod_mac_to_mad(float %b, float %a) #0 {
 ; GFX11PLUS-NEXT:    v_add_f32_e64 v1, v1, v0 mul:2
 ; GFX11PLUS-NEXT:    v_mul_f32_e32 v0, v1, v0
 ; GFX11PLUS-NEXT:    global_store_b32 v[0:1], v0, off
-; GFX11PLUS-NEXT:    s_nop 0
-; GFX11PLUS-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11PLUS-NEXT:    s_endpgm
   %mul = fmul float %a, %a
   %add = fadd float %mul, %b
diff --git a/llvm/test/CodeGen/AMDGPU/release-vgprs-dbg-loc.mir b/llvm/test/CodeGen/AMDGPU/release-vgprs-dbg-loc.mir
index e50ee11fef51c0..80cb39830f1af4 100644
--- a/llvm/test/CodeGen/AMDGPU/release-vgprs-dbg-loc.mir
+++ b/llvm/test/CodeGen/AMDGPU/release-vgprs-dbg-loc.mir
@@ -30,11 +30,11 @@ machineFunctionInfo:
 body:             |
   bb.0:
     ; CHECK-LABEL: name: test
-    ; CHECK: GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec
+    ; CHECK: GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr97, 0, 4, implicit $exec
     ; CHECK-NEXT: S_NOP 0, debug-location !8
     ; CHECK-NEXT: S_SENDMSG 3, implicit $exec, implicit $m0, debug-location !8
     ; CHECK-NEXT: S_ENDPGM 0, debug-location !8
-    GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec
+    GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr97, 0, 4, implicit $exec
     S_ENDPGM 0, debug-location !8
 ...
 
diff --git a/llvm/test/CodeGen/AMDGPU/release-vgprs.mir b/llvm/test/CodeGen/AMDGPU/release-vgprs.mir
index 25804123eff181..a9cb169d1a0966 100644
--- a/llvm/test/CodeGen/AMDGPU/release-vgprs.mir
+++ b/llvm/test/CodeGen/AMDGPU/release-vgprs.mir
@@ -24,6 +24,7 @@
   define amdgpu_ps void @global_store_optnone() noinline optnone { ret void }
   define amdgpu_cs void @with_calls() { ret void }
   define fastcc void @with_tail_calls() { ret void }
+  define amdgpu_cs void @waveslot_limited() { ret void }
 ...
 
 ---
@@ -36,13 +37,13 @@ body:             |
     ; OPT: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 42, 117, 0, 0, implicit $exec
     ; OPT-NEXT: S_NOP 0
     ; OPT-NEXT: S_SENDMSG 3, implicit $exec, implicit $m0
-    ; OPT-NEXT: S_ENDPGM 0
+    ; OPT-NEXT: S_ENDPGM 0, implicit $vgpr97
     ;
     ; NOOPT-LABEL: name: tbuffer_store1
     ; NOOPT: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 42, 117, 0, 0, implicit $exec
-    ; NOOPT-NEXT: S_ENDPGM 0
+    ; NOOPT-NEXT: S_ENDPGM 0, implicit $vgpr97
     TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 42, 117, 0, 0, implicit $exec
-    S_ENDPGM 0
+    S_ENDPGM 0, implicit $vgpr97
 ...
 
 ---
@@ -55,13 +56,13 @@ body:             |
     ; OPT: TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 115, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7)
     ; OPT-NEXT: S_NOP 0
     ; OPT-NEXT: S_SENDMSG 3, implicit $exec, implicit $m0
-    ; OPT-NEXT: S_ENDPGM 0
+    ; OPT-NEXT: S_ENDPGM 0, implicit $vgpr97
     ;
     ; NOOPT-LABEL: name: tbuffer_store2
     ; NOOPT: TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 115, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7)
-    ; NOOPT-NEXT: S_ENDPGM 0
+    ; NOOPT-NEXT: S_ENDPGM 0, implicit $vgpr97
     TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 115, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7)
-    S_ENDPGM 0
+    S_ENDPGM 0, implicit $vgpr97
 ...
 
 ---
@@ -72,9 +73,9 @@ body:             |
   bb.0:
     ; CHECK-LABEL: name: flat_store
     ; CHECK: FLAT_STORE_DWORDX4 $vgpr49_vgpr50, $vgpr26_vgpr27_vgpr28_vgpr29, 0, 0, implicit $exec, implicit $flat_scr
-    ; CHECK-NEXT: S_ENDPGM 0
+    ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr97
     FLAT_STORE_DWORDX4 $vgpr49_vgpr50, $vgpr26_vgpr27_vgpr28_vgpr29, 0, 0, implicit $exec, implicit $flat_scr
-    S_ENDPGM 0
+    S_ENDPGM 0, implicit $vgpr97
 ...
 
 ---
@@ -88,15 +89,15 @@ body:             |
     ; OPT-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 0
     ; OPT-NEXT: S_NOP 0
     ; OPT-NEXT: S_SENDMSG 3, implicit $exec, implicit $m0
-    ; OPT-NEXT: S_ENDPGM 0
+    ; OPT-NEXT: S_ENDPGM 0, implicit $vgpr97
     ;
     ; NOOPT-LABEL: name: global_store
     ; NOOPT: GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec
     ; NOOPT-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 0
-    ; NOOPT-NEXT: S_ENDPGM 0
+    ; NOOPT-NEXT: S_ENDPGM 0, implicit $vgpr97
     GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec
     S_WAITCNT_VSCNT undef $sgpr_null, 0
-    S_ENDPGM 0
+    S_ENDPGM 0, implicit $vgpr97
 ...
 
 ---
@@ -109,13 +110,13 @@ body:             |
     ; OPT: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 0, 0, 0, implicit $exec
     ; OPT-NEXT: S_NOP 0
     ; OPT-NEXT: S_SENDMSG 3, implicit $exec, implicit $m0
-    ; OPT-NEXT: S_ENDPGM 0
+    ; OPT-NEXT: S_ENDPGM 0, implicit $vgpr97
     ;
     ; NOOPT-LABEL: name: buffer_store_format
     ; NOOPT: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 0, 0, 0, implicit $exec
-    ; NOOPT-NEXT: S_ENDPGM 0
+    ; NOOPT-NEXT: S_ENDPGM 0, implicit $vgpr97
     BUFFER_STORE_FORMAT_D16_X_OFFEN_exact killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 0, 0, 0, implicit $exec
-    S_ENDPGM 0
+    S_ENDPGM 0, implicit $vgpr97
 ...
 
 ---
@@ -128,11 +129,11 @@ body:             |
     ; CHECK: renamable $vgpr0 = IMPLICIT_DEF
     ; CHECK-NEXT: renamable $vgpr1 = IMPLICIT_DEF
     ; CHECK-NEXT: DS_WRITE_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, 12, 0, implicit $exec
-    ; CHECK-NEXT: S_ENDPGM 0
+    ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr97
     renamable $vgpr0 = IMPLICIT_DEF
     renamable $vgpr1 = IMPLICIT_DEF
     DS_WRITE_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, 12, 0, implicit $exec
-    S_ENDPGM 0
+    S_ENDPGM 0, implicit $vgpr97
 
 ...
 ---
@@ -150,17 +151,17 @@ body:             |
     ; OPT-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr2, killed renamable $vgpr0, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec
     ; OPT-NEXT: S_NOP 0
     ; OPT-NEXT: S_SENDMSG 3, implicit $exec, implicit $m0
-    ; OPT-NEXT: S_ENDPGM 0
+    ; OPT-NEXT: S_ENDPGM 0, implicit $vgpr97
     ;
     ; NOOPT-LABEL: name: global_store_dword
     ; NOOPT: liveins: $vgpr0, $sgpr0_sgpr1
     ; NOOPT-NEXT: {{  $}}
     ; NOOPT-NEXT: renamable $vgpr0 = V_MAD_I32_I24_e64 killed $vgpr1, killed $vgpr0, killed $sgpr2, 0, implicit $exec
     ; NOOPT-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr2, killed renamable $vgpr0, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec
-    ; NOOPT-NEXT: S_ENDPGM 0
+    ; NOOPT-NEXT: S_ENDPGM 0, implicit $vgpr97
     renamable $vgpr0 = V_MAD_I32_I24_e64 killed $vgpr1, killed $vgpr0, killed $sgpr2, 0, implicit $exec
     GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr2, killed renamable $vgpr0, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec
-    S_ENDPGM 0
+    S_ENDPGM 0, implicit $vgpr97
 ...
 
 ---
@@ -185,7 +186,7 @@ body:             |
   ; CHECK-NEXT:   S_BRANCH %bb.2
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
-  ; CHECK-NEXT:   S_ENDPGM 0
+  ; CHECK-NEXT:   S_ENDPGM 0, implicit $vgpr97
   bb.0:
     successors: %bb.1
 
@@ -201,7 +202,7 @@ body:             |
     S_BRANCH %bb.2
 
   bb.2:
-    S_ENDPGM 0
+    S_ENDPGM 0, implicit $vgpr97
 
 ...
 
@@ -231,7 +232,7 @@ body:             |
   ; OPT-NEXT: bb.2:
   ; OPT-NEXT:   S_NOP 0
   ; OPT-NEXT:   S_SENDMSG 3, implicit $exec, implicit $m0
-  ; OPT-NEXT:   S_ENDPGM 0
+  ; OPT-NEXT:   S_ENDPGM 0, implicit $vgpr97
   ;
   ; NOOPT-LABEL: name: multiple_basic_blocks2
   ; NOOPT: bb.0:
@@ -249,7 +250,7 @@ body:             |
   ; NOOPT-NEXT:   S_BRANCH %bb.2
   ; NOOPT-NEXT: {{  $}}
   ; NOOPT-NEXT: bb.2:
-  ; NOOPT-NEXT:   S_ENDPGM 0
+  ; NOOPT-NEXT:   S_ENDPGM 0, implicit $vgpr97
   bb.0:
     successors: %bb.2
 
@@ -265,7 +266,7 @@ body:             |
     S_BRANCH %bb.2
 
   bb.2:
-    S_ENDPGM 0
+    S_ENDPGM 0, implicit $vgpr97
 ...
 
 
@@ -303,7 +304,7 @@ body:             |
   ; OPT-NEXT: bb.4:
   ; OPT-NEXT:   S_NOP 0
   ; OPT-NEXT:   S_SENDMSG 3, implicit $exec, implicit $m0
-  ; OPT-NEXT:   S_ENDPGM 0
+  ; OPT-NEXT:   S_ENDPGM 0, implicit $vgpr97
   ;
   ; NOOPT-LABEL: name: multiple_basic_blocks3
   ; NOOPT: bb.0:
@@ -331,7 +332,7 @@ body:             |
   ; NOOPT-NEXT:   S_BRANCH %bb.4
   ; NOOPT-NEXT: {{  $}}
   ; NOOPT-NEXT: bb.4:
-  ; NOOPT-NEXT:   S_ENDPGM 0
+  ; NOOPT-NEXT:   S_ENDPGM 0, implicit $vgpr97
   bb.0:
     successors: %bb.2
 
@@ -357,7 +358,7 @@ body:             |
     S_BRANCH %bb.4
 
   bb.4:
-    S_ENDPGM 0
+    S_ENDPGM 0, implicit $vgpr97
 ...
 
 ---
@@ -380,7 +381,7 @@ body:             |
   ; CHECK-NEXT:   S_BRANCH %bb.2
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
-  ; CHECK-NEXT:   S_ENDPGM 0
+  ; CHECK-NEXT:   S_ENDPGM 0, implicit $vgpr97
   bb.0:
     successors: %bb.1
 
@@ -395,7 +396,7 @@ body:             |
     S_BRANCH %bb.2
 
   bb.2:
-    S_ENDPGM 0
+    S_ENDPGM 0, implicit $vgpr97
 ...
 
 ---
@@ -422,7 +423,7 @@ body:             |
   ; OPT-NEXT: bb.2:
   ; OPT-NEXT:   S_NOP 0
   ; OPT-NEXT:   S_SENDMSG 3, implicit $exec, implicit $m0
-  ; OPT-NEXT:   S_ENDPGM 0
+  ; OPT-NEXT:   S_ENDPGM 0, implicit $vgpr97
   ;
   ; NOOPT-LABEL: name: recursive_loop_vmem
   ; NOOPT: bb.0:
@@ -441,7 +442,7 @@ body:             |
   ; NOOPT-NEXT:   S_BRANCH %bb.2
   ; NOOPT-NEXT: {{  $}}
   ; NOOPT-NEXT: bb.2:
-  ; NOOPT-NEXT:   S_ENDPGM 0
+  ; NOOPT-NEXT:   S_ENDPGM 0, implicit $vgpr97
   bb.0:
     successors: %bb.1
 
@@ -457,7 +458,7 @@ body:             |
     S_BRANCH %bb.2
 
   bb.2:
-    S_ENDPGM 0
+    S_ENDPGM 0, implicit $vgpr97
 ...
 
 ---
@@ -470,13 +471,13 @@ body:             |
     ; OPT: IMAGE_STORE_V2_V1_gfx11 killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 12, 0, 1, 0, 0, -1, 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s32>), addrspace 7)
     ; OPT-NEXT: S_NOP 0
     ; OPT-NEXT: S_SENDMSG 3, implicit $exec, implicit $m0
-    ; OPT-NEXT: S_ENDPGM 0
+    ; OPT-NEXT: S_ENDPGM 0, implicit $vgpr97
     ;
     ; NOOPT-LABEL: name: image_store
     ; NOOPT: IMAGE_STORE_V2_V1_gfx11 killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 12, 0, 1, 0, 0, -1, 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s32>), addrspace 7)
-    ; NOOPT-NEXT: S_ENDPGM 0
+    ; NOOPT-NEXT: S_ENDPGM 0, implicit $vgpr97
   IMAGE_STORE_V2_V1_gfx11 killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 12, 0, 1, 0, 0, -1, 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s32>), addrspace 7)
-  S_ENDPGM 0
+  S_ENDPGM 0, implicit $vgpr97
 ...
 
 ---
@@ -488,10 +489,10 @@ body:             |
     ; CHECK-LABEL: name: scratch_store
     ; CHECK: renamable $sgpr0 = S_AND_B32 killed renamable $sgpr0, -16, implicit-def dead $scc
     ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $sgpr0, 0, 0, implicit $exec, implicit $flat_scr
-    ; CHECK-NEXT: S_ENDPGM 0
+    ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr97
     renamable $sgpr0 = S_AND_B32 killed renamable $sgpr0, -16, implicit-def dead $scc
     SCRATCH_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $sgpr0, 0, 0, implicit $exec, implicit $flat_scr
-    S_ENDPGM 0
+    S_ENDPGM 0, implicit $vgpr97
 ...
 
 ---
@@ -504,13 +505,13 @@ body:             |
     ; OPT: BUFFER_ATOMIC_ADD_F32_OFFEN killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 7)
     ; OPT-NEXT: S_NOP 0
     ; OPT-NEXT: S_SENDMSG 3, implicit $exec, implicit $m0
-    ; OPT-NEXT: S_ENDPGM 0
+    ; OPT-NEXT: S_ENDPGM 0, implicit $vgpr97
     ;
     ; NOOPT-LABEL: name: buffer_atomic
     ; NOOPT: BUFFER_ATOMIC_ADD_F32_OFFEN killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 7)
-    ; NOOPT-NEXT: S_ENDPGM 0
+    ; NOOPT-NEXT: S_ENDPGM 0, implicit $vgpr97
     BUFFER_ATOMIC_ADD_F32_OFFEN killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 7)
-    S_ENDPGM 0
+    S_ENDPGM 0, implicit $vgpr97
 ...
 
 ---
@@ -521,9 +522,9 @@ body:             |
   bb.0:
     ; CHECK-LABEL: name: flat_atomic
     ; CHECK: renamable $vgpr0_vgpr1 = FLAT_ATOMIC_DEC_X2_RTN killed renamable $vgpr0_vgpr1, killed renamable $vgpr2_vgpr3, 40, 1, implicit $exec, implicit $flat_scr
-    ; CHECK-NEXT: S_ENDPGM 0
+    ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr97
     renamable $vgpr0_vgpr1 = FLAT_ATOMIC_DEC_X2_RTN killed renamable $vgpr0_vgpr1, killed renamable $vgpr2_vgpr3, 40, 1, implicit $exec, implicit $flat_scr
-    S_ENDPGM 0
+    S_ENDPGM 0, implicit $vgpr97
 ...
 
 
@@ -535,9 +536,9 @@ body:             |
   bb.0:
     ; CHECK-LABEL: name: global_atomic
     ; CHECK: renamable $vgpr0_vgpr1 = GLOBAL_ATOMIC_INC_X2_SADDR_RTN killed renamable $vgpr0, killed renamable $vgpr1_vgpr2, killed renamable $sgpr0_sgpr1, 40, 1, implicit $exec
-    ; CHECK-NEXT: S_ENDPGM 0
+    ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr97
     renamable $vgpr0_vgpr1 = GLOBAL_ATOMIC_INC_X2_SADDR_RTN killed renamable $vgpr0, killed renamable $vgpr1_vgpr2, killed renamable $sgpr0_sgpr1, 40, 1, implicit $exec
-    S_ENDPGM 0
+    S_ENDPGM 0, implicit $vgpr97
 ...
 
 ---
@@ -548,9 +549,9 @@ body:             |
   bb.0:
     ; CHECK-LABEL: name: image_atomic
     ; CHECK: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx11 killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, 0, 1, 1, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 7)
-    ; CHECK-NEXT: S_ENDPGM 0
+    ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr97
     renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx11 killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, 0, 1, 1, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 7)
-    S_ENDPGM 0
+    S_ENDPGM 0, implicit $vgpr97
 ...
 
 ---
@@ -562,38 +563,59 @@ body:             |
     ; CHECK-LABEL: name: global_store_optnone
     ; CHECK: GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec
     ; CHECK-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 0
-    ; CHECK-NEXT: S_ENDPGM 0
+    ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr97
     GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec
     S_WAITCNT_VSCNT undef $sgpr_null, 0
-    S_ENDPGM 0
+    S_ENDPGM 0, implicit $vgpr97
 ...
 
 ---
+# Make sure we don't send DEALLOC_VGPRS after a call, since there might be
+# scratch stores still in progress.
 name:            with_calls
 frameInfo:
   hasCalls:        true
 body:             |
   bb.0:
-    ; Make sure we don't send DEALLOC_VGPRS after a call, since there might be
-    ; scratch stores still in progress.
     ; CHECK-LABEL: name: with_calls
-    ; CHECK-NOT: S_SENDMSG 3
-    ; CHECK: S_ENDPGM 0
+    ; CHECK: S_WAITCNT 0
+    ; CHECK-NEXT: GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec
+    ; CHECK-NEXT: $sgpr30_sgpr31 = SI_CALL undef renamable $sgpr4_sgpr5, 0, csr_amdgpu
+    ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr97
     GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec
     $sgpr30_sgpr31 = SI_CALL undef renamable $sgpr4_sgpr5, 0, csr_amdgpu
-    S_ENDPGM 0
+    S_ENDPGM 0, implicit $vgpr97
 ...
 
 ---
+# Make sure we don't send DEALLOC_VGPRS when there's a tail call, since the
+# only valid action after DEALLOC_VGPRS is to terminate the wave.
 name:            with_tail_calls
 frameInfo:
   hasCalls:        true
 body:             |
   bb.0:
-    ; Make sure we don't send DEALLOC_VGPRS when there's a tail call, since the
-    ; only valid action after DEALLOC_VGPRS is to terminate the wave.
     ; CHECK-LABEL: name: with_tail_calls
-    ; CHECK-NOT: S_SENDMSG 3
-    GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec
+    ; CHECK: S_WAITCNT 0
+    ; CHECK-NEXT: GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr97, 0, 4, implicit $exec
+    ; CHECK-NEXT: SI_TCRETURN undef renamable $sgpr4_sgpr5, @with_tail_calls, 0, csr_amdgpu
+    GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr97, 0, 4, implicit $exec
     SI_TCRETURN undef renamable $sgpr4_sgpr5, @with_tail_calls, 0, csr_amdgpu
 ...
+
+---
+# Do not deallocate VGPRs if kernel uses a small enough number of VGPRs
+# making it likely waveslot limited and not VGPR limited. For gfx11 that
+# is 96 registers.
+name:            waveslot_limited
+frameInfo:
+  hasCalls:        true
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: waveslot_limited
+    ; CHECK: S_WAITCNT 0
+    ; CHECK-NEXT: GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr96, 0, 4, implicit $exec
+    ; CHECK-NEXT: S_ENDPGM 0
+    GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr96, 0, 4, implicit $exec
+    S_ENDPGM 0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/rotl.ll b/llvm/test/CodeGen/AMDGPU/rotl.ll
index f692584ef92be2..d1e785f8daa0af 100644
--- a/llvm/test/CodeGen/AMDGPU/rotl.ll
+++ b/llvm/test/CodeGen/AMDGPU/rotl.ll
@@ -64,8 +64,6 @@ define amdgpu_kernel void @rotl_i32(ptr addrspace(1) %in, i32 %x, i32 %y) {
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_alignbit_b32 v1, s2, s2, s3
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %0 = shl i32 %x, %y
@@ -151,8 +149,6 @@ define amdgpu_kernel void @rotl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
 ; GFX11-NEXT:    v_alignbit_b32 v1, s5, s5, s2
 ; GFX11-NEXT:    v_alignbit_b32 v0, s4, s4, s3
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %0 = shl <2 x i32> %x, %y
@@ -264,8 +260,6 @@ define amdgpu_kernel void @rotl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
 ; GFX11-NEXT:    v_alignbit_b32 v1, s5, s5, s3
 ; GFX11-NEXT:    v_alignbit_b32 v0, s4, s4, s2
 ; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %0 = shl <4 x i32> %x, %y
diff --git a/llvm/test/CodeGen/AMDGPU/rotr.ll b/llvm/test/CodeGen/AMDGPU/rotr.ll
index a368aa1055b282..2188a9864faa0d 100644
--- a/llvm/test/CodeGen/AMDGPU/rotr.ll
+++ b/llvm/test/CodeGen/AMDGPU/rotr.ll
@@ -57,8 +57,6 @@ define amdgpu_kernel void @rotr_i32(ptr addrspace(1) %in, i32 %x, i32 %y) {
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_alignbit_b32 v1, s2, s2, s3
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %tmp0 = sub i32 32, %y
@@ -132,8 +130,6 @@ define amdgpu_kernel void @rotr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
 ; GFX11-NEXT:    v_alignbit_b32 v1, s5, s5, s7
 ; GFX11-NEXT:    v_alignbit_b32 v0, s4, s4, s6
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %tmp0 = sub <2 x i32> <i32 32, i32 32>, %y
@@ -221,8 +217,6 @@ define amdgpu_kernel void @rotr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
 ; GFX11-NEXT:    v_alignbit_b32 v1, s5, s5, s9
 ; GFX11-NEXT:    v_alignbit_b32 v0, s4, s4, s8
 ; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %tmp0 = sub <4 x i32> <i32 32, i32 32, i32 32, i32 32>, %y
diff --git a/llvm/test/CodeGen/AMDGPU/saddo.ll b/llvm/test/CodeGen/AMDGPU/saddo.ll
index 57c936d4689ebb..d8deb810971106 100644
--- a/llvm/test/CodeGen/AMDGPU/saddo.ll
+++ b/llvm/test/CodeGen/AMDGPU/saddo.ll
@@ -113,8 +113,6 @@ define amdgpu_kernel void @saddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b)
 ; GFX11-NEXT:    v_add_co_u32 v0, s0, s2, v0
 ; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, s3, 0, s0
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind
   %val = extractvalue { i64, i1 } %sadd, 0
@@ -220,8 +218,6 @@ define amdgpu_kernel void @s_saddo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b32 v1, v2, s[0:1]
 ; GFX11-NEXT:    global_store_b8 v1, v0, s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %sadd = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b) nounwind
   %val = extractvalue { i32, i1 } %sadd, 0
@@ -336,8 +332,6 @@ define amdgpu_kernel void @v_saddo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    global_store_b8 v0, v2, s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %a = load i32, ptr addrspace(1) %aptr, align 4
   %b = load i32, ptr addrspace(1) %bptr, align 4
@@ -451,8 +445,6 @@ define amdgpu_kernel void @s_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-NEXT:    global_store_b8 v2, v3, s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind
   %val = extractvalue { i64, i1 } %sadd, 0
@@ -576,8 +568,6 @@ define amdgpu_kernel void @v_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b64 v6, v[4:5], s[4:5]
 ; GFX11-NEXT:    global_store_b8 v6, v0, s[6:7]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %a = load i64, ptr addrspace(1) %aptr, align 4
   %b = load i64, ptr addrspace(1) %bptr, align 4
@@ -716,8 +706,6 @@ define amdgpu_kernel void @v_saddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b64 v5, v[3:4], s[0:1]
 ; GFX11-NEXT:    global_store_b64 v5, v[0:1], s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %a = load <2 x i32>, ptr addrspace(1) %aptr, align 4
   %b = load <2 x i32>, ptr addrspace(1) %bptr, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/scalar-float-sopc.ll b/llvm/test/CodeGen/AMDGPU/scalar-float-sopc.ll
index dafd23af9d658d..ca508eb400170d 100644
--- a/llvm/test/CodeGen/AMDGPU/scalar-float-sopc.ll
+++ b/llvm/test/CodeGen/AMDGPU/scalar-float-sopc.ll
@@ -11,8 +11,6 @@ define amdgpu_vs void @f32_olt(ptr addrspace(1) inreg %out, float inreg %a, floa
 ; SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s2
 ; SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: f32_olt:
@@ -24,8 +22,6 @@ define amdgpu_vs void @f32_olt(ptr addrspace(1) inreg %out, float inreg %a, floa
 ; GISEL-NEXT:    s_bfe_i32 s2, s2, 0x10000
 ; GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT:    s_endpgm
 entry:
   %0 = fcmp olt float %a, %b
@@ -43,8 +39,6 @@ define amdgpu_vs void @f32_oeq(ptr addrspace(1) inreg %out, float inreg %a, floa
 ; SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s2
 ; SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: f32_oeq:
@@ -56,8 +50,6 @@ define amdgpu_vs void @f32_oeq(ptr addrspace(1) inreg %out, float inreg %a, floa
 ; GISEL-NEXT:    s_bfe_i32 s2, s2, 0x10000
 ; GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT:    s_endpgm
 entry:
   %0 = fcmp oeq float %a, %b
@@ -75,8 +67,6 @@ define amdgpu_vs void @f32_ole(ptr addrspace(1) inreg %out, float inreg %a, floa
 ; SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s2
 ; SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: f32_ole:
@@ -88,8 +78,6 @@ define amdgpu_vs void @f32_ole(ptr addrspace(1) inreg %out, float inreg %a, floa
 ; GISEL-NEXT:    s_bfe_i32 s2, s2, 0x10000
 ; GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT:    s_endpgm
 entry:
   %0 = fcmp ole float %a, %b
@@ -107,8 +95,6 @@ define amdgpu_vs void @f32_ogt(ptr addrspace(1) inreg %out, float inreg %a, floa
 ; SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s2
 ; SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: f32_ogt:
@@ -120,8 +106,6 @@ define amdgpu_vs void @f32_ogt(ptr addrspace(1) inreg %out, float inreg %a, floa
 ; GISEL-NEXT:    s_bfe_i32 s2, s2, 0x10000
 ; GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT:    s_endpgm
 entry:
   %0 = fcmp ogt float %a, %b
@@ -139,8 +123,6 @@ define amdgpu_vs void @f32_one(ptr addrspace(1) inreg %out, float inreg %a, floa
 ; SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s2
 ; SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: f32_one:
@@ -152,8 +134,6 @@ define amdgpu_vs void @f32_one(ptr addrspace(1) inreg %out, float inreg %a, floa
 ; GISEL-NEXT:    s_bfe_i32 s2, s2, 0x10000
 ; GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT:    s_endpgm
 entry:
   %0 = fcmp one float %a, %b
@@ -171,8 +151,6 @@ define amdgpu_vs void @f32_oge(ptr addrspace(1) inreg %out, float inreg %a, floa
 ; SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s2
 ; SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: f32_oge:
@@ -184,8 +162,6 @@ define amdgpu_vs void @f32_oge(ptr addrspace(1) inreg %out, float inreg %a, floa
 ; GISEL-NEXT:    s_bfe_i32 s2, s2, 0x10000
 ; GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT:    s_endpgm
 entry:
   %0 = fcmp oge float %a, %b
@@ -203,8 +179,6 @@ define amdgpu_vs void @f32_ord(ptr addrspace(1) inreg %out, float inreg %a, floa
 ; SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s2
 ; SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: f32_ord:
@@ -216,8 +190,6 @@ define amdgpu_vs void @f32_ord(ptr addrspace(1) inreg %out, float inreg %a, floa
 ; GISEL-NEXT:    s_bfe_i32 s2, s2, 0x10000
 ; GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT:    s_endpgm
 entry:
   %0 = fcmp ord float %a, %b
@@ -235,8 +207,6 @@ define amdgpu_vs void @f32_uno(ptr addrspace(1) inreg %out, float inreg %a, floa
 ; SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s2
 ; SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: f32_uno:
@@ -248,8 +218,6 @@ define amdgpu_vs void @f32_uno(ptr addrspace(1) inreg %out, float inreg %a, floa
 ; GISEL-NEXT:    s_bfe_i32 s2, s2, 0x10000
 ; GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT:    s_endpgm
 entry:
   %0 = fcmp uno float %a, %b
@@ -267,8 +235,6 @@ define amdgpu_vs void @f32_ult(ptr addrspace(1) inreg %out, float inreg %a, floa
 ; SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s2
 ; SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: f32_ult:
@@ -280,8 +246,6 @@ define amdgpu_vs void @f32_ult(ptr addrspace(1) inreg %out, float inreg %a, floa
 ; GISEL-NEXT:    s_bfe_i32 s2, s2, 0x10000
 ; GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT:    s_endpgm
 entry:
   %0 = fcmp ult float %a, %b
@@ -299,8 +263,6 @@ define amdgpu_vs void @f32_ueq(ptr addrspace(1) inreg %out, float inreg %a, floa
 ; SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s2
 ; SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: f32_ueq:
@@ -312,8 +274,6 @@ define amdgpu_vs void @f32_ueq(ptr addrspace(1) inreg %out, float inreg %a, floa
 ; GISEL-NEXT:    s_bfe_i32 s2, s2, 0x10000
 ; GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT:    s_endpgm
 entry:
   %0 = fcmp ueq float %a, %b
@@ -331,8 +291,6 @@ define amdgpu_vs void @f32_ule(ptr addrspace(1) inreg %out, float inreg %a, floa
 ; SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s2
 ; SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: f32_ule:
@@ -344,8 +302,6 @@ define amdgpu_vs void @f32_ule(ptr addrspace(1) inreg %out, float inreg %a, floa
 ; GISEL-NEXT:    s_bfe_i32 s2, s2, 0x10000
 ; GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT:    s_endpgm
 entry:
   %0 = fcmp ule float %a, %b
@@ -363,8 +319,6 @@ define amdgpu_vs void @f32_ugt(ptr addrspace(1) inreg %out, float inreg %a, floa
 ; SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s2
 ; SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: f32_ugt:
@@ -376,8 +330,6 @@ define amdgpu_vs void @f32_ugt(ptr addrspace(1) inreg %out, float inreg %a, floa
 ; GISEL-NEXT:    s_bfe_i32 s2, s2, 0x10000
 ; GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT:    s_endpgm
 entry:
   %0 = fcmp ugt float %a, %b
@@ -395,8 +347,6 @@ define amdgpu_vs void @f32_une(ptr addrspace(1) inreg %out, float inreg %a, floa
 ; SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s2
 ; SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: f32_une:
@@ -408,8 +358,6 @@ define amdgpu_vs void @f32_une(ptr addrspace(1) inreg %out, float inreg %a, floa
 ; GISEL-NEXT:    s_bfe_i32 s2, s2, 0x10000
 ; GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT:    s_endpgm
 entry:
   %0 = fcmp une float %a, %b
@@ -427,8 +375,6 @@ define amdgpu_vs void @f32_uge(ptr addrspace(1) inreg %out, float inreg %a, floa
 ; SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s2
 ; SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: f32_uge:
@@ -440,8 +386,6 @@ define amdgpu_vs void @f32_uge(ptr addrspace(1) inreg %out, float inreg %a, floa
 ; GISEL-NEXT:    s_bfe_i32 s2, s2, 0x10000
 ; GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT:    s_endpgm
 entry:
   %0 = fcmp uge float %a, %b
@@ -459,8 +403,6 @@ define amdgpu_vs void @f16_olt(ptr addrspace(1) inreg %out, half inreg %a, half
 ; SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s2
 ; SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: f16_olt:
@@ -472,8 +414,6 @@ define amdgpu_vs void @f16_olt(ptr addrspace(1) inreg %out, half inreg %a, half
 ; GISEL-NEXT:    s_bfe_i32 s2, s2, 0x10000
 ; GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT:    s_endpgm
 entry:
   %0 = fcmp olt half %a, %b
@@ -491,8 +431,6 @@ define amdgpu_vs void @f16_oeq(ptr addrspace(1) inreg %out, half inreg %a, half
 ; SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s2
 ; SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: f16_oeq:
@@ -504,8 +442,6 @@ define amdgpu_vs void @f16_oeq(ptr addrspace(1) inreg %out, half inreg %a, half
 ; GISEL-NEXT:    s_bfe_i32 s2, s2, 0x10000
 ; GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT:    s_endpgm
 entry:
   %0 = fcmp oeq half %a, %b
@@ -523,8 +459,6 @@ define amdgpu_vs void @f16_ole(ptr addrspace(1) inreg %out, half inreg %a, half
 ; SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s2
 ; SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: f16_ole:
@@ -536,8 +470,6 @@ define amdgpu_vs void @f16_ole(ptr addrspace(1) inreg %out, half inreg %a, half
 ; GISEL-NEXT:    s_bfe_i32 s2, s2, 0x10000
 ; GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT:    s_endpgm
 entry:
   %0 = fcmp ole half %a, %b
@@ -555,8 +487,6 @@ define amdgpu_vs void @f16_ogt(ptr addrspace(1) inreg %out, half inreg %a, half
 ; SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s2
 ; SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: f16_ogt:
@@ -568,8 +498,6 @@ define amdgpu_vs void @f16_ogt(ptr addrspace(1) inreg %out, half inreg %a, half
 ; GISEL-NEXT:    s_bfe_i32 s2, s2, 0x10000
 ; GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT:    s_endpgm
 entry:
   %0 = fcmp ogt half %a, %b
@@ -587,8 +515,6 @@ define amdgpu_vs void @f16_one(ptr addrspace(1) inreg %out, half inreg %a, half
 ; SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s2
 ; SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: f16_one:
@@ -600,8 +526,6 @@ define amdgpu_vs void @f16_one(ptr addrspace(1) inreg %out, half inreg %a, half
 ; GISEL-NEXT:    s_bfe_i32 s2, s2, 0x10000
 ; GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT:    s_endpgm
 entry:
   %0 = fcmp one half %a, %b
@@ -619,8 +543,6 @@ define amdgpu_vs void @f16_oge(ptr addrspace(1) inreg %out, half inreg %a, half
 ; SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s2
 ; SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: f16_oge:
@@ -632,8 +554,6 @@ define amdgpu_vs void @f16_oge(ptr addrspace(1) inreg %out, half inreg %a, half
 ; GISEL-NEXT:    s_bfe_i32 s2, s2, 0x10000
 ; GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT:    s_endpgm
 entry:
   %0 = fcmp oge half %a, %b
@@ -651,8 +571,6 @@ define amdgpu_vs void @f16_ord(ptr addrspace(1) inreg %out, half inreg %a, half
 ; SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s2
 ; SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: f16_ord:
@@ -664,8 +582,6 @@ define amdgpu_vs void @f16_ord(ptr addrspace(1) inreg %out, half inreg %a, half
 ; GISEL-NEXT:    s_bfe_i32 s2, s2, 0x10000
 ; GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT:    s_endpgm
 entry:
   %0 = fcmp ord half %a, %b
@@ -683,8 +599,6 @@ define amdgpu_vs void @f16_uno(ptr addrspace(1) inreg %out, half inreg %a, half
 ; SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s2
 ; SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: f16_uno:
@@ -696,8 +610,6 @@ define amdgpu_vs void @f16_uno(ptr addrspace(1) inreg %out, half inreg %a, half
 ; GISEL-NEXT:    s_bfe_i32 s2, s2, 0x10000
 ; GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT:    s_endpgm
 entry:
   %0 = fcmp uno half %a, %b
@@ -715,8 +627,6 @@ define amdgpu_vs void @f16_ult(ptr addrspace(1) inreg %out, half inreg %a, half
 ; SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s2
 ; SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: f16_ult:
@@ -728,8 +638,6 @@ define amdgpu_vs void @f16_ult(ptr addrspace(1) inreg %out, half inreg %a, half
 ; GISEL-NEXT:    s_bfe_i32 s2, s2, 0x10000
 ; GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT:    s_endpgm
 entry:
   %0 = fcmp ult half %a, %b
@@ -747,8 +655,6 @@ define amdgpu_vs void @f16_ueq(ptr addrspace(1) inreg %out, half inreg %a, half
 ; SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s2
 ; SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: f16_ueq:
@@ -760,8 +666,6 @@ define amdgpu_vs void @f16_ueq(ptr addrspace(1) inreg %out, half inreg %a, half
 ; GISEL-NEXT:    s_bfe_i32 s2, s2, 0x10000
 ; GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT:    s_endpgm
 entry:
   %0 = fcmp ueq half %a, %b
@@ -779,8 +683,6 @@ define amdgpu_vs void @f16_ule(ptr addrspace(1) inreg %out, half inreg %a, half
 ; SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s2
 ; SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: f16_ule:
@@ -792,8 +694,6 @@ define amdgpu_vs void @f16_ule(ptr addrspace(1) inreg %out, half inreg %a, half
 ; GISEL-NEXT:    s_bfe_i32 s2, s2, 0x10000
 ; GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT:    s_endpgm
 entry:
   %0 = fcmp ule half %a, %b
@@ -811,8 +711,6 @@ define amdgpu_vs void @f16_ugt(ptr addrspace(1) inreg %out, half inreg %a, half
 ; SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s2
 ; SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: f16_ugt:
@@ -824,8 +722,6 @@ define amdgpu_vs void @f16_ugt(ptr addrspace(1) inreg %out, half inreg %a, half
 ; GISEL-NEXT:    s_bfe_i32 s2, s2, 0x10000
 ; GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT:    s_endpgm
 entry:
   %0 = fcmp ugt half %a, %b
@@ -843,8 +739,6 @@ define amdgpu_vs void @f16_une(ptr addrspace(1) inreg %out, half inreg %a, half
 ; SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s2
 ; SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: f16_une:
@@ -856,8 +750,6 @@ define amdgpu_vs void @f16_une(ptr addrspace(1) inreg %out, half inreg %a, half
 ; GISEL-NEXT:    s_bfe_i32 s2, s2, 0x10000
 ; GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT:    s_endpgm
 entry:
   %0 = fcmp une half %a, %b
@@ -875,8 +767,6 @@ define amdgpu_vs void @f16_uge(ptr addrspace(1) inreg %out, half inreg %a, half
 ; SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s2
 ; SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: f16_uge:
@@ -888,8 +778,6 @@ define amdgpu_vs void @f16_uge(ptr addrspace(1) inreg %out, half inreg %a, half
 ; GISEL-NEXT:    s_bfe_i32 s2, s2, 0x10000
 ; GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT:    s_endpgm
 entry:
   %0 = fcmp uge half %a, %b
diff --git a/llvm/test/CodeGen/AMDGPU/select.f16.ll b/llvm/test/CodeGen/AMDGPU/select.f16.ll
index cc109595d8d703..84ca5dd0c18633 100644
--- a/llvm/test/CodeGen/AMDGPU/select.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/select.f16.ll
@@ -116,8 +116,6 @@ define amdgpu_kernel void @select_f16(
 ; GFX11-NEXT:    v_cmp_lt_f16_e32 vcc_lo, v0, v1
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc_lo
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[12:15], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 
     ptr addrspace(1) %r,
@@ -232,8 +230,6 @@ define amdgpu_kernel void @select_f16_imm_a(
 ; GFX11-NEXT:    v_cmp_lt_f16_e32 vcc_lo, 0.5, v0
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %b,
@@ -345,8 +341,6 @@ define amdgpu_kernel void @select_f16_imm_b(
 ; GFX11-NEXT:    v_cmp_gt_f16_e32 vcc_lo, 0.5, v0
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
@@ -459,8 +453,6 @@ define amdgpu_kernel void @select_f16_imm_c(
 ; GFX11-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v0, v1
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x3800, v2, vcc_lo
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
@@ -573,8 +565,6 @@ define amdgpu_kernel void @select_f16_imm_d(
 ; GFX11-NEXT:    v_cmp_lt_f16_e32 vcc_lo, v0, v1
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x3800, v2, vcc_lo
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
@@ -732,8 +722,6 @@ define amdgpu_kernel void @select_v2f16(
 ; GFX11-NEXT:    v_dual_cndmask_b32 v1, v4, v7 :: v_dual_and_b32 v0, 0xffff, v0
 ; GFX11-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
@@ -875,8 +863,6 @@ define amdgpu_kernel void @select_v2f16_imm_a(
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %b,
@@ -1016,8 +1002,6 @@ define amdgpu_kernel void @select_v2f16_imm_b(
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
@@ -1159,8 +1143,6 @@ define amdgpu_kernel void @select_v2f16_imm_c(
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
@@ -1302,8 +1284,6 @@ define amdgpu_kernel void @select_v2f16_imm_d(
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
diff --git a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
index 9b9f03ff74aa3f..52dbd31b2c646c 100644
--- a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
@@ -79,8 +79,6 @@ define amdgpu_kernel void @s_shl_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2
 ; GFX11-NEXT:    s_mov_b32 s4, s0
 ; GFX11-NEXT:    s_mov_b32 s5, s1
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %result = shl <2 x i16> %lhs, %rhs
   store <2 x i16> %result, ptr addrspace(1) %out
@@ -162,8 +160,6 @@ define amdgpu_kernel void @v_shl_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_lshlrev_b16 v0, v1, v0
 ; GFX11-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -261,8 +257,6 @@ define amdgpu_kernel void @shl_v_s_v2i16(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_lshlrev_b16 v1, s0, v1
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -358,8 +352,6 @@ define amdgpu_kernel void @shl_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_lshlrev_b16 v1, v1, s0
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -446,8 +438,6 @@ define amdgpu_kernel void @shl_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace(
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_lshlrev_b16 v1, v1, 8 op_sel_hi:[1,0]
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -530,8 +520,6 @@ define amdgpu_kernel void @shl_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace(
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -631,8 +619,6 @@ define amdgpu_kernel void @v_shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX11-NEXT:    v_pk_lshlrev_b16 v1, v3, v1
 ; GFX11-NEXT:    v_pk_lshlrev_b16 v0, v2, v0
 ; GFX11-NEXT:    global_store_b64 v4, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -730,8 +716,6 @@ define amdgpu_kernel void @shl_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace(
 ; GFX11-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
 ; GFX11-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
diff --git a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
index 88e2bb772a2d33..6ac04d8bc42bba 100644
--- a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
+++ b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
@@ -115,8 +115,6 @@ define amdgpu_kernel void @v_test_i32_x_sub_64(ptr addrspace(1) %out, ptr addrsp
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_subrev_nc_u32_e32 v1, 64, v1
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -272,8 +270,6 @@ define amdgpu_kernel void @v_test_i32_x_sub_64_multi_use(ptr addrspace(1) %out,
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    global_store_b32 v0, v2, s[0:1] dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -390,8 +386,6 @@ define amdgpu_kernel void @v_test_i32_64_sub_x(ptr addrspace(1) %out, ptr addrsp
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v1, 64, v1
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -527,8 +521,6 @@ define amdgpu_kernel void @v_test_i32_x_sub_65(ptr addrspace(1) %out, ptr addrsp
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-SDAG-NEXT:    v_add_nc_u32_e32 v1, 0xffffffbf, v1
 ; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_test_i32_x_sub_65:
@@ -542,8 +534,6 @@ define amdgpu_kernel void @v_test_i32_x_sub_65(ptr addrspace(1) %out, ptr addrsp
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    v_subrev_nc_u32_e32 v1, 0x41, v1
 ; GFX11-GISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -657,8 +647,6 @@ define amdgpu_kernel void @v_test_i32_65_sub_x(ptr addrspace(1) %out, ptr addrsp
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v1, 0x41, v1
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -794,8 +782,6 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg16(ptr addrspace(1) %out, ptr add
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-SDAG-NEXT:    v_add_nc_u32_e32 v1, 16, v1
 ; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_test_i32_x_sub_neg16:
@@ -809,8 +795,6 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg16(ptr addrspace(1) %out, ptr add
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    v_subrev_nc_u32_e32 v1, -16, v1
 ; GFX11-GISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -924,8 +908,6 @@ define amdgpu_kernel void @v_test_i32_neg16_sub_x(ptr addrspace(1) %out, ptr add
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v1, -16, v1
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -1061,8 +1043,6 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg17(ptr addrspace(1) %out, ptr add
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-SDAG-NEXT:    v_add_nc_u32_e32 v1, 17, v1
 ; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_test_i32_x_sub_neg17:
@@ -1076,8 +1056,6 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg17(ptr addrspace(1) %out, ptr add
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    v_subrev_nc_u32_e32 v1, 0xffffffef, v1
 ; GFX11-GISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -1191,8 +1169,6 @@ define amdgpu_kernel void @v_test_i32_neg17_sub_x(ptr addrspace(1) %out, ptr add
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v1, 0xffffffef, v1
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -1361,8 +1337,6 @@ define amdgpu_kernel void @v_test_i16_x_sub_64(ptr addrspace(1) %out, ptr addrsp
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_sub_nc_u16 v1, v1, 64
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -1488,8 +1462,6 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -1646,8 +1618,6 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(ptr addrspace(1) %out,
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    global_store_b16 v0, v2, s[0:1] dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -1779,8 +1749,6 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(ptr addrspace(1) %out, ptr a
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_sub_i16 v1, v1, 64 op_sel_hi:[1,0]
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -1922,8 +1890,6 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_64(ptr addrspace(1) %out, ptr ad
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_sub_i16 v1, v1, 0x400007
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -2065,8 +2031,6 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(ptr addrspace(1) %out, ptr
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_sub_i16 v1, v1, 0x7b0040
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -2192,8 +2156,6 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_0(ptr addrspace(1) %out, ptr add
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_sub_i16 v1, v1, 7
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -2317,8 +2279,6 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_16(ptr addrspace(1) %out, ptr ad
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_sub_i16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0]
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -2454,8 +2414,6 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_1_0(ptr addrspace(1) %out, ptr a
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_sub_i16 v1, v1, 0xc4000000
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -2591,8 +2549,6 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_neg1_0(ptr addrspace(1) %out, pt
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_sub_i16 v1, v1, 0x44000000
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -2722,8 +2678,6 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_neg32(ptr addrspace(1) %out,
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_sub_u16 v1, v1, 32 op_sel_hi:[1,0]
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -2846,8 +2800,6 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg32(ptr addrspace(1) %out, ptr
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0]
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -2972,8 +2924,6 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_0(ptr addrspace(1) %out, ptr
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_sub_u16 v1, v1, 32
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -3103,8 +3053,6 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_neg16(ptr addrspace(1) %out,
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_add_u16 v1, v1, -16 op_sel_hi:[1,0]
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -3227,8 +3175,6 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg16(ptr addrspace(1) %out, ptr
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_sub_u16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0]
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -3353,8 +3299,6 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_0(ptr addrspace(1) %out, ptr
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_sub_u16 v1, v1, 16
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -3507,8 +3451,6 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(ptr addrspace(1) %out, p
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-SDAG-NEXT:    v_pk_add_u16 v1, 0xffffc400, v1 op_sel_hi:[0,1]
 ; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_test_v2i16_x_add_neg_fpone:
@@ -3522,8 +3464,6 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(ptr addrspace(1) %out, p
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    v_pk_add_u16 v1, 0xc400c400, v1
 ; GFX11-GISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -3676,8 +3616,6 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(ptr addrspace(1) %out
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-SDAG-NEXT:    v_pk_add_u16 v1, 0x4400, v1 op_sel_hi:[0,1]
 ; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_test_v2i16_x_add_neg_negfpone:
@@ -3691,8 +3629,6 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(ptr addrspace(1) %out
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    v_pk_add_u16 v1, 0x44004400, v1
 ; GFX11-GISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -3821,8 +3757,6 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fptwo(ptr addrspace(1) %out, p
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_add_u16 v1, v1, 2.0 op_sel:[0,1]
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -3951,8 +3885,6 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfptwo(ptr addrspace(1) %out
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_add_u16 v1, v1, -2.0 op_sel:[0,1]
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -4074,8 +4006,6 @@ define amdgpu_kernel void @v_test_v2i16_x_add_undef_neg32(ptr addrspace(1) %out,
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0]
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -4217,8 +4147,6 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_undef(ptr addrspace(1) %out,
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-SDAG-NEXT:    v_pk_sub_u16 v1, v1, 32
 ; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-SDAG-NEXT:    s_nop 0
-; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: v_test_v2i16_x_add_neg32_undef:
@@ -4232,8 +4160,6 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_undef(ptr addrspace(1) %out,
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    v_pk_add_u16 v1, 0xffffffe0, v1
 ; GFX11-GISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-GISEL-NEXT:    s_nop 0
-; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
diff --git a/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll b/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll
index 3b35b2d3d9865f..ab00d7d33bb9c7 100644
--- a/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll
@@ -76,8 +76,6 @@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f16(ptr addrspace(1) %out, i64 %i
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX11-NEXT:    global_store_b16 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %result = sitofp i64 %in to half
   store half %result, ptr addrspace(1) %out
@@ -173,8 +171,6 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f16(ptr addrspace(1) %out, ptr ad
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 1, v2
 ; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX11-NEXT:    global_store_b16 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
@@ -252,8 +248,6 @@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f32(ptr addrspace(1) %out, i64 %i
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_ldexp_f32 v0, v0, s2
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %result = sitofp i64 %in to float
   store float %result, ptr addrspace(1) %out
@@ -346,8 +340,6 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f32(ptr addrspace(1) %out, ptr ad
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX11-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
@@ -460,8 +452,6 @@ define amdgpu_kernel void @s_sint_to_fp_v2i64_to_v2f32(ptr addrspace(1) %out, <2
 ; GFX11-NEXT:    v_ldexp_f32 v1, v0, s2
 ; GFX11-NEXT:    v_ldexp_f32 v0, v2, s3
 ; GFX11-NEXT:    global_store_b64 v3, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %result = sitofp <2 x i64> %in to <2 x float>
   store <2 x float> %result, ptr addrspace(1) %out
@@ -671,8 +661,6 @@ define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f32(ptr addrspace(1) %out, pt
 ; GFX11-NEXT:    v_ldexp_f32 v1, v1, v11
 ; GFX11-NEXT:    v_ldexp_f32 v0, v5, v4
 ; GFX11-NEXT:    global_store_b128 v6, v[0:3], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr <4 x i64>, ptr addrspace(1) %in, i32 %tid
@@ -797,8 +785,6 @@ define amdgpu_kernel void @s_sint_to_fp_v2i64_to_v2f16(ptr addrspace(1) %out, <2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v1, v0
 ; GFX11-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %result = sitofp <2 x i64> %in to <2 x half>
   store <2 x half> %result, ptr addrspace(1) %out
@@ -1031,8 +1017,6 @@ define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f16(ptr addrspace(1) %out, pt
 ; GFX11-NEXT:    v_pack_b32_f16 v1, v0, v2
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v3, v4
 ; GFX11-NEXT:    global_store_b64 v5, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr <4 x i64>, ptr addrspace(1) %in, i32 %tid
diff --git a/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll b/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll
index 9169433cdca56f..8d34c7bb14ed49 100644
--- a/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll
@@ -58,8 +58,6 @@ define amdgpu_kernel void @sitofp_i16_to_f16(
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_cvt_f16_i16_e32 v0.l, v0.l
 ; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-TRUE16-NEXT:    s_nop 0
-; GFX11-TRUE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: sitofp_i16_to_f16:
@@ -78,8 +76,6 @@ define amdgpu_kernel void @sitofp_i16_to_f16(
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FAKE16-NEXT:    v_cvt_f16_i16_e32 v0, v0
 ; GFX11-FAKE16-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-FAKE16-NEXT:    s_nop 0
-; GFX11-FAKE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
@@ -147,8 +143,6 @@ define amdgpu_kernel void @sitofp_i32_to_f16(
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
 ; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-TRUE16-NEXT:    s_nop 0
-; GFX11-TRUE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: sitofp_i32_to_f16:
@@ -169,8 +163,6 @@ define amdgpu_kernel void @sitofp_i32_to_f16(
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX11-FAKE16-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-FAKE16-NEXT:    s_nop 0
-; GFX11-FAKE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
@@ -252,8 +244,6 @@ define amdgpu_kernel void @sitofp_v2i16_to_v2f16(
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
 ; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v1, v0
 ; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-TRUE16-NEXT:    s_nop 0
-; GFX11-TRUE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: sitofp_v2i16_to_v2f16:
@@ -276,8 +266,6 @@ define amdgpu_kernel void @sitofp_v2i16_to_v2f16(
 ; GFX11-FAKE16-NEXT:    v_cvt_f16_i16_e32 v1, v1
 ; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX11-FAKE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-FAKE16-NEXT:    s_nop 0
-; GFX11-FAKE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
@@ -359,8 +347,6 @@ define amdgpu_kernel void @sitofp_v2i32_to_v2f16(
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v1, v0
 ; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-TRUE16-NEXT:    s_nop 0
-; GFX11-TRUE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: sitofp_v2i32_to_v2f16:
@@ -385,8 +371,6 @@ define amdgpu_kernel void @sitofp_v2i32_to_v2f16(
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX11-FAKE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-FAKE16-NEXT:    s_nop 0
-; GFX11-FAKE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
@@ -480,8 +464,6 @@ define amdgpu_kernel void @s_sint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr add
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
 ; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
-; GFX11-TRUE16-NEXT:    s_nop 0
-; GFX11-TRUE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: s_sint_to_fp_i1_to_f16:
@@ -512,8 +494,6 @@ define amdgpu_kernel void @s_sint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr add
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX11-FAKE16-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
-; GFX11-FAKE16-NEXT:    s_nop 0
-; GFX11-FAKE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FAKE16-NEXT:    s_endpgm
   %a = load float, ptr addrspace(1) %in0
   %b = load float, ptr addrspace(1) %in1
diff --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
index 8e0a83671a1837..fdcb6c941e16ec 100644
--- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
+++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
@@ -688,8 +688,6 @@ define amdgpu_ps void @test_kill_control_flow_remainder(i32 inreg %arg) #0 {
 ; GFX11-NEXT:    s_cbranch_scc0 .LBB8_2
 ; GFX11-NEXT:  ; %bb.1: ; %exit
 ; GFX11-NEXT:    global_store_b32 v[0:1], v9, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ; GFX11-NEXT:  .LBB8_2: ; %bb
 ; GFX11-NEXT:    ;;#ASMSTART
@@ -721,8 +719,6 @@ define amdgpu_ps void @test_kill_control_flow_remainder(i32 inreg %arg) #0 {
 ; GFX11-NEXT:    v_mov_b32_e64 v9, -2
 ; GFX11-NEXT:    ;;#ASMEND
 ; GFX11-NEXT:    global_store_b32 v[0:1], v9, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ; GFX11-NEXT:  .LBB8_4:
 ; GFX11-NEXT:    s_mov_b64 exec, 0
@@ -1105,8 +1101,6 @@ define amdgpu_ps void @test_kill_divergent_loop(i32 %arg) #0 {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX11-NEXT:    global_store_b32 v[0:1], v0, off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ; GFX11-NEXT:  .LBB10_4:
 ; GFX11-NEXT:    s_mov_b64 exec, 0
diff --git a/llvm/test/CodeGen/AMDGPU/sub.ll b/llvm/test/CodeGen/AMDGPU/sub.ll
index df12e32d971a20..85dd5581f287b2 100644
--- a/llvm/test/CodeGen/AMDGPU/sub.ll
+++ b/llvm/test/CodeGen/AMDGPU/sub.ll
@@ -49,8 +49,6 @@ define amdgpu_kernel void @s_sub_i32(ptr addrspace(1) %out, i32 %a, i32 %b) {
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %result = sub i32 %a, %b
   store i32 %result, ptr addrspace(1) %out
@@ -101,8 +99,6 @@ define amdgpu_kernel void @s_sub_imm_i32(ptr addrspace(1) %out, i32 %a) {
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %result = sub i32 1234, %a
   store i32 %result, ptr addrspace(1) %out
@@ -162,8 +158,6 @@ define amdgpu_kernel void @test_sub_i32(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_sub_nc_u32_e32 v0, v0, v1
 ; GFX12-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %b_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
   %a = load i32, ptr addrspace(1) %in
@@ -226,8 +220,6 @@ define amdgpu_kernel void @test_sub_imm_i32(ptr addrspace(1) %out, ptr addrspace
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_sub_nc_u32_e32 v1, 0x7b, v1
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %a = load i32, ptr addrspace(1) %in
   %result = sub i32 123, %a
@@ -292,8 +284,6 @@ define amdgpu_kernel void @test_sub_v2i32(ptr addrspace(1) %out, ptr addrspace(1
 ; GFX12-NEXT:    v_sub_nc_u32_e32 v1, v1, v3
 ; GFX12-NEXT:    v_sub_nc_u32_e32 v0, v0, v2
 ; GFX12-NEXT:    global_store_b64 v4, v[0:1], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %b_ptr = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 1
   %a = load <2 x i32>, ptr addrspace(1) %in
@@ -377,8 +367,6 @@ define amdgpu_kernel void @test_sub_v4i32(ptr addrspace(1) %out, ptr addrspace(1
 ; GFX12-NEXT:    v_sub_nc_u32_e32 v1, v5, v1
 ; GFX12-NEXT:    v_sub_nc_u32_e32 v0, v4, v0
 ; GFX12-NEXT:    global_store_b128 v8, v[0:3], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %b_ptr = getelementptr <4 x i32>, ptr addrspace(1) %in, i32 1
   %a = load <4 x i32>, ptr addrspace(1) %in
@@ -458,8 +446,6 @@ define amdgpu_kernel void @test_sub_i16(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_sub_nc_u16 v0, v1, v0
 ; GFX12-NEXT:    global_store_b16 v2, v0, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr i16, ptr addrspace(1) %in, i32 %tid
@@ -539,8 +525,6 @@ define amdgpu_kernel void @test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace(1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_pk_sub_i16 v0, v0, v1
 ; GFX12-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr <2 x i16>, ptr addrspace(1) %in, i32 %tid
@@ -632,8 +616,6 @@ define amdgpu_kernel void @test_sub_v4i16(ptr addrspace(1) %out, ptr addrspace(1
 ; GFX12-NEXT:    v_pk_sub_i16 v1, v1, v3
 ; GFX12-NEXT:    v_pk_sub_i16 v0, v0, v2
 ; GFX12-NEXT:    global_store_b64 v4, v[0:1], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr <4 x i16>, ptr addrspace(1) %in, i32 %tid
@@ -697,8 +679,6 @@ define amdgpu_kernel void @s_sub_i64(ptr addrspace(1) noalias %out, i64 %a, i64
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %result = sub i64 %a, %b
   store i64 %result, ptr addrspace(1) %out, align 8
@@ -783,8 +763,6 @@ define amdgpu_kernel void @v_sub_i64(ptr addrspace(1) noalias %out, ptr addrspac
 ; GFX12-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v2
 ; GFX12-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX12-NEXT:    global_store_b64 v4, v[0:1], s[4:5]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone
   %a_ptr = getelementptr i64, ptr addrspace(1) %inA, i32 %tid
@@ -882,8 +860,6 @@ define amdgpu_kernel void @v_test_sub_v2i64(ptr addrspace(1) %out, ptr addrspace
 ; GFX12-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v4
 ; GFX12-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo
 ; GFX12-NEXT:    global_store_b128 v8, v[0:3], s[4:5]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone
   %a_ptr = getelementptr <2 x i64>, ptr addrspace(1) %inA, i32 %tid
@@ -1022,8 +998,6 @@ define amdgpu_kernel void @v_test_sub_v4i64(ptr addrspace(1) %out, ptr addrspace
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v16, v[8:11], s[4:5] offset:16
 ; GFX12-NEXT:    global_store_b128 v16, v[0:3], s[4:5]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone
   %a_ptr = getelementptr <4 x i64>, ptr addrspace(1) %inA, i32 %tid
diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
index 5a821db6ff0408..18a94f75b6d948 100644
--- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
@@ -81,8 +81,6 @@ define amdgpu_kernel void @v_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace
 ; GFX11-NEXT:    s_mov_b32 s6, -1
 ; GFX11-NEXT:    v_pk_sub_i16 v0, v1, v0
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
@@ -164,8 +162,6 @@ define amdgpu_kernel void @s_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_pk_sub_i16 v0, s2, s0
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %a = load <2 x i16>, ptr addrspace(4) %in0
   %b = load <2 x i16>, ptr addrspace(4) %in1
@@ -203,8 +199,6 @@ define amdgpu_kernel void @s_test_sub_self_v2i16(ptr addrspace(1) %out, ptr addr
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %a = load <2 x i16>, ptr addrspace(4) %in0
   %add = sub <2 x i16> %a, %a
@@ -268,8 +262,6 @@ define amdgpu_kernel void @s_test_sub_v2i16_kernarg(ptr addrspace(1) %out, <2 x
 ; GFX11-NEXT:    s_mov_b32 s4, s0
 ; GFX11-NEXT:    s_mov_b32 s5, s1
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %add = sub <2 x i16> %a, %b
   store <2 x i16> %add, ptr addrspace(1) %out
@@ -337,8 +329,6 @@ define amdgpu_kernel void @v_test_sub_v2i16_constant(ptr addrspace(1) %out, ptr
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    v_pk_sub_i16 v0, v0, 0x1c8007b
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
@@ -411,8 +401,6 @@ define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(ptr addrspace(1) %out,
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    v_pk_sub_i16 v0, v0, 0xfc21fcb3
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
@@ -483,8 +471,6 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(ptr addrspace(1) %out, p
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    v_pk_sub_i16 v0, v0, -1
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
@@ -554,8 +540,6 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(ptr addrspace(1) %
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    v_pk_sub_i16 v0, v0, 32
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
@@ -626,8 +610,6 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(ptr addrspace(1) %ou
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    v_pk_sub_i16 v0, v0, 1.0
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
@@ -721,8 +703,6 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(ptr addrspace(1) %out,
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i32>, ptr addrspace(1) %out, i32 %tid
@@ -826,8 +806,6 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(ptr addrspace(1) %out,
 ; GFX11-NEXT:    v_alignbit_b32 v2, 0, v0, 16
 ; GFX11-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v0, 0xffff, v0
 ; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[4:7], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i64>, ptr addrspace(1) %out, i32 %tid
@@ -926,8 +904,6 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(ptr addrspace(1) %out,
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v1, 16, v0
 ; GFX11-NEXT:    v_bfe_i32 v0, v0, 0, 16
 ; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i32>, ptr addrspace(1) %out, i32 %tid
@@ -1036,8 +1012,6 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i64(ptr addrspace(1) %out,
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
 ; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[4:7], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i64>, ptr addrspace(1) %out, i32 %tid
diff --git a/llvm/test/CodeGen/AMDGPU/trap-abis.ll b/llvm/test/CodeGen/AMDGPU/trap-abis.ll
index 0daa6860072616..b8f0d7617167e0 100644
--- a/llvm/test/CodeGen/AMDGPU/trap-abis.ll
+++ b/llvm/test/CodeGen/AMDGPU/trap-abis.ll
@@ -186,8 +186,6 @@ define amdgpu_kernel void @non_entry_trap(ptr addrspace(1) nocapture readonly %a
 ; HSA-TRAP-GFX1100-NEXT:    v_mov_b32_e32 v1, 3
 ; HSA-TRAP-GFX1100-NEXT:    global_store_b32 v0, v1, s[0:1] dlc
 ; HSA-TRAP-GFX1100-NEXT:    s_waitcnt_vscnt null, 0x0
-; HSA-TRAP-GFX1100-NEXT:    s_nop 0
-; HSA-TRAP-GFX1100-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; HSA-TRAP-GFX1100-NEXT:    s_endpgm
 ; HSA-TRAP-GFX1100-NEXT:  .LBB1_2: ; %trap
 ; HSA-TRAP-GFX1100-NEXT:    s_trap 2
@@ -331,8 +329,6 @@ define amdgpu_kernel void @trap_with_use_after(ptr addrspace(1) %arg0, ptr addrs
 ; HSA-TRAP-GFX1100-NEXT:  ; %bb.1:
 ; HSA-TRAP-GFX1100-NEXT:    global_store_b32 v0, v1, s[2:3] dlc
 ; HSA-TRAP-GFX1100-NEXT:    s_waitcnt_vscnt null, 0x0
-; HSA-TRAP-GFX1100-NEXT:    s_nop 0
-; HSA-TRAP-GFX1100-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; HSA-TRAP-GFX1100-NEXT:    s_endpgm
 ; HSA-TRAP-GFX1100-NEXT:  .LBB2_2:
 ; HSA-TRAP-GFX1100-NEXT:    s_trap 2
@@ -465,8 +461,6 @@ define amdgpu_kernel void @debugtrap(ptr addrspace(1) nocapture readonly %arg0)
 ; HSA-TRAP-GFX1100-NEXT:    s_trap 3
 ; HSA-TRAP-GFX1100-NEXT:    global_store_b32 v0, v2, s[0:1] dlc
 ; HSA-TRAP-GFX1100-NEXT:    s_waitcnt_vscnt null, 0x0
-; HSA-TRAP-GFX1100-NEXT:    s_nop 0
-; HSA-TRAP-GFX1100-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; HSA-TRAP-GFX1100-NEXT:    s_endpgm
 ;
 ; HSA-TRAP-GFX1100-O0-LABEL: debugtrap:
diff --git a/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll b/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll
index 3d0fc4e6281a6b..764f99fb6833fe 100644
--- a/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll
@@ -64,8 +64,6 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f16(ptr addrspace(1) %out, i64 %i
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX11-NEXT:    global_store_b16 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %result = uitofp i64 %in to half
   store half %result, ptr addrspace(1) %out
@@ -147,8 +145,6 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f16(ptr addrspace(1) %out, ptr ad
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 1, v2
 ; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX11-NEXT:    global_store_b16 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
@@ -214,8 +210,6 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f32(ptr addrspace(1) %out, i64 %i
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_ldexp_f32 v0, v0, s2
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %result = uitofp i64 %in to float
   store float %result, ptr addrspace(1) %out
@@ -294,8 +288,6 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f32(ptr addrspace(1) %out, ptr ad
 ; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; GFX11-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX11-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
@@ -384,8 +376,6 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f32(ptr addrspace(1) %out, <2
 ; GFX11-NEXT:    v_ldexp_f32 v1, v0, s2
 ; GFX11-NEXT:    v_ldexp_f32 v0, v2, s3
 ; GFX11-NEXT:    global_store_b64 v3, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %result = uitofp <2 x i64> %in to <2 x float>
   store <2 x float> %result, ptr addrspace(1) %out
@@ -546,8 +536,6 @@ define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f32(ptr addrspace(1) %out, pt
 ; GFX11-NEXT:    v_ldexp_f32 v1, v1, v11
 ; GFX11-NEXT:    v_ldexp_f32 v0, v5, v4
 ; GFX11-NEXT:    global_store_b128 v6, v[0:3], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr <4 x i64>, ptr addrspace(1) %in, i32 %tid
@@ -648,8 +636,6 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f16(ptr addrspace(1) %out, <2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v1, v0
 ; GFX11-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %result = uitofp <2 x i64> %in to <2 x half>
   store <2 x half> %result, ptr addrspace(1) %out
@@ -833,8 +819,6 @@ define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f16(ptr addrspace(1) %out, pt
 ; GFX11-NEXT:    v_pack_b32_f16 v1, v0, v2
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v3, v4
 ; GFX11-NEXT:    global_store_b64 v5, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr <4 x i64>, ptr addrspace(1) %in, i32 %tid
diff --git a/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll b/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll
index c4268c15d9db66..686dba7e53e62d 100644
--- a/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll
@@ -58,8 +58,6 @@ define amdgpu_kernel void @uitofp_i16_to_f16(
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_cvt_f16_u16_e32 v0.l, v0.l
 ; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-TRUE16-NEXT:    s_nop 0
-; GFX11-TRUE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: uitofp_i16_to_f16:
@@ -78,8 +76,6 @@ define amdgpu_kernel void @uitofp_i16_to_f16(
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FAKE16-NEXT:    v_cvt_f16_u16_e32 v0, v0
 ; GFX11-FAKE16-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-FAKE16-NEXT:    s_nop 0
-; GFX11-FAKE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
@@ -147,8 +143,6 @@ define amdgpu_kernel void @uitofp_i32_to_f16(
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
 ; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-TRUE16-NEXT:    s_nop 0
-; GFX11-TRUE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: uitofp_i32_to_f16:
@@ -169,8 +163,6 @@ define amdgpu_kernel void @uitofp_i32_to_f16(
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX11-FAKE16-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-FAKE16-NEXT:    s_nop 0
-; GFX11-FAKE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
@@ -252,8 +244,6 @@ define amdgpu_kernel void @uitofp_v2i16_to_v2f16(
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
 ; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v1, v0
 ; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-TRUE16-NEXT:    s_nop 0
-; GFX11-TRUE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: uitofp_v2i16_to_v2f16:
@@ -276,8 +266,6 @@ define amdgpu_kernel void @uitofp_v2i16_to_v2f16(
 ; GFX11-FAKE16-NEXT:    v_cvt_f16_u16_e32 v1, v1
 ; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX11-FAKE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-FAKE16-NEXT:    s_nop 0
-; GFX11-FAKE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
@@ -359,8 +347,6 @@ define amdgpu_kernel void @uitofp_v2i32_to_v2f16(
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v1, v0
 ; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-TRUE16-NEXT:    s_nop 0
-; GFX11-TRUE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: uitofp_v2i32_to_v2f16:
@@ -385,8 +371,6 @@ define amdgpu_kernel void @uitofp_v2i32_to_v2f16(
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX11-FAKE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-FAKE16-NEXT:    s_nop 0
-; GFX11-FAKE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
@@ -480,8 +464,6 @@ define amdgpu_kernel void @s_uint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr add
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
 ; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
-; GFX11-TRUE16-NEXT:    s_nop 0
-; GFX11-TRUE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: s_uint_to_fp_i1_to_f16:
@@ -512,8 +494,6 @@ define amdgpu_kernel void @s_uint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr add
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX11-FAKE16-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
-; GFX11-FAKE16-NEXT:    s_nop 0
-; GFX11-FAKE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-FAKE16-NEXT:    s_endpgm
   %a = load float, ptr addrspace(1) %in0
   %b = load float, ptr addrspace(1) %in1
diff --git a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
index d1bf5ecb569849..e40f1e89afd28c 100644
--- a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
@@ -89,8 +89,6 @@ define amdgpu_kernel void @v_cnd_nan_nosgpr(ptr addrspace(1) %out, i32 %c, ptr a
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %idx = call i32 @llvm.amdgcn.workitem.id.x() #1
   %f.gep = getelementptr float, ptr addrspace(1) %fptr, i32 %idx
@@ -156,8 +154,6 @@ define amdgpu_kernel void @v_cnd_nan(ptr addrspace(1) %out, i32 %c, float %f) #0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, -1, s3, s[4:5]
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %setcc = icmp ne i32 %c, 0
   %select = select i1 %setcc, float 0xFFFFFFFFE0000000, float %f
@@ -223,8 +219,6 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %o
 ; GFX11-NEXT:    v_cmp_nlg_f32_e64 s[4:5], s0, 0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 1.0, s1, s[4:5]
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
@@ -290,8 +284,6 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprX_f32(ptr addrspace(1) %o
 ; GFX11-NEXT:    v_cmp_nlg_f32_e64 s[2:3], s4, 0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 1.0, s4, s[2:3]
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
@@ -357,8 +349,6 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprZ_f32(ptr addrspace(1) %o
 ; GFX11-NEXT:    v_cmp_nlg_f32_e64 s[4:5], s0, 0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, s1, s[4:5]
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
@@ -424,8 +414,6 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprX_f32(ptr addrspace(1) %o
 ; GFX11-NEXT:    v_cmp_nlg_f32_e64 s[2:3], s4, 0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, s4, s[2:3]
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
@@ -509,8 +497,6 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_vgprZ_f32(ptr addrspace(1) %o
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
@@ -596,8 +582,6 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %o
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
@@ -678,8 +662,6 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %o
 ; GFX11-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v1
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 1.0, s0, vcc
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
@@ -770,8 +752,6 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %o
 ; GFX11-NEXT:    v_cmp_le_f32_e32 vcc, 0, v1
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
@@ -864,8 +844,6 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i32(ptr addrspace(1) %o
 ; GFX11-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v1
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, 2, v2, vcc
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
@@ -962,8 +940,6 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(ptr addrspace(1) %o
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0, v3, vcc
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, 2, v2, vcc
 ; GFX11-NEXT:    global_store_b64 v4, v[0:1], s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
@@ -1073,8 +1049,6 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_vgprZ_k1_v4f32(ptr addrspace(1)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, 2.0, v1, vcc
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, 1.0, v0, vcc
 ; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
@@ -1184,8 +1158,6 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_v4f32(ptr addrspace(1)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, 2.0, v1, vcc
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, 1.0, v0, vcc
 ; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
@@ -1297,8 +1269,6 @@ define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(ptr addrspace(1)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, 2.0, v1, vcc
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, 1.0, v0, vcc
 ; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
@@ -1406,8 +1376,6 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(ptr addrspace(1) %ou
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
 ; GFX11-NEXT:    global_store_b8 v0, v1, s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
@@ -1512,8 +1480,6 @@ define amdgpu_kernel void @fcmp_vgprX_k0_selectf64_k1_vgprZ_f32(ptr addrspace(1)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x3ff00000, v1, vcc
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
@@ -1616,8 +1582,6 @@ define amdgpu_kernel void @fcmp_vgprX_k0_selecti64_k1_vgprZ_f32(ptr addrspace(1)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, 3, v0, vcc
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
@@ -1711,8 +1675,6 @@ define amdgpu_kernel void @icmp_vgprX_k0_selectf32_k1_vgprZ_i32(ptr addrspace(1)
 ; GFX11-NEXT:    v_cmp_gt_u32_e32 vcc, 2, v1
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, 4.0, v2, vcc
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
@@ -1822,8 +1784,6 @@ define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2(ptr add
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    global_store_b32 v0, v2, s[4:5] dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
@@ -1931,8 +1891,6 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f16(ptr addrspace(1) %out, i32 %c,
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX11-NEXT:    global_store_b16 v2, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %idx = call i32 @llvm.amdgcn.workitem.id.x() #1
   %f.gep = getelementptr half, ptr addrspace(1) %fptr, i32 %idx
@@ -2024,8 +1982,6 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f32(ptr addrspace(1) %out, i32 %c,
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, -v0, |v0|, s[2:3]
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %idx = call i32 @llvm.amdgcn.workitem.id.x() #1
   %f.gep = getelementptr float, ptr addrspace(1) %fptr, i32 %idx
@@ -2131,8 +2087,6 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f64(ptr addrspace(1) %out, i32 %c,
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GFX11-NEXT:    global_store_b64 v3, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %idx = call i32 @llvm.amdgcn.workitem.id.x() #1
   %f.gep = getelementptr double, ptr addrspace(1) %fptr, i32 %idx
diff --git a/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll b/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll
index 472a443cf6dde7..691009e9c58d1b 100644
--- a/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll
@@ -75,8 +75,6 @@ define amdgpu_kernel void @madak_f16(
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_add_f16_e32 v0, 0x4900, v0
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
@@ -201,8 +199,6 @@ define amdgpu_kernel void @madak_f16_use_2(
 ; GFX11-NEXT:    v_add_f16_e32 v0, 0x4900, v0
 ; GFX11-NEXT:    buffer_store_b16 v1, off, s[12:15], 0
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r0,
     ptr addrspace(1) %r1,
diff --git a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
index ee99fcc5863340..3cfc3dcd0efdb6 100644
--- a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
@@ -126,8 +126,6 @@ define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg
 ; SDAG-GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; SDAG-GFX11-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; SDAG-GFX11-NEXT:    global_store_b32 v2, v0, s[0:1]
-; SDAG-GFX11-NEXT:    s_nop 0
-; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-VI-LABEL: basic_smax_smin_sgpr:
@@ -193,8 +191,6 @@ define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg
 ; GISEL-GFX11-NEXT:    s_pack_ll_b32_b16 s2, s2, s3
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
   %src0 = trunc i32 %src0ext to i16
   %src1 = trunc i32 %src1ext to i16
@@ -451,8 +447,6 @@ define amdgpu_kernel void @vec_smax_smin_sgpr(ptr addrspace(1) %out, <2 x i16> i
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_pk_min_i16 v0, 0xff, v0 op_sel_hi:[0,1]
 ; SDAG-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; SDAG-GFX11-NEXT:    s_nop 0
-; SDAG-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; SDAG-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-VI-LABEL: vec_smax_smin_sgpr:
@@ -526,8 +520,6 @@ define amdgpu_kernel void @vec_smax_smin_sgpr(ptr addrspace(1) %out, <2 x i16> i
 ; GISEL-GFX11-NEXT:    s_pack_ll_b32_b16 s2, s3, s2
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GISEL-GFX11-NEXT:    s_nop 0
-; GISEL-GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-GFX11-NEXT:    s_endpgm
   %src.max = call <2 x i16> @llvm.smax.v2i16(<2 x i16> %src, <2 x i16> <i16 0, i16 0>)
   %src.clamp = call <2 x i16> @llvm.smin.v2i16(<2 x i16> %src.max, <2 x i16> <i16 255, i16 255>)
diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
index 2797c5b7988810..bdde260ff8bd5e 100644
--- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
@@ -1639,8 +1639,6 @@ define amdgpu_kernel void @fma_shuffle_v2f16(ptr addrspace(1) nocapture readonly
 ; GFX11-NEXT:    v_pk_fma_f16 v0, v0, v3, v4 op_sel:[1,0,0]
 ; GFX11-NEXT:    v_pk_fma_f16 v1, v1, v3, v2 op_sel:[1,0,0]
 ; GFX11-NEXT:    global_store_b64 v6, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -1751,8 +1749,6 @@ define amdgpu_kernel void @shuffle_scalar_load_v8i32_0123(ptr addrspace(4) %in,
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7
 ; GFX11-NEXT:    v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6
 ; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[2:3]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %ld8 = load <8 x i32>, ptr addrspace(4) %in, align 16
   %id = shufflevector <8 x i32> %ld8, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -4496,8 +4492,6 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl
 ; GFX11-NEXT:    v_cndmask_b32_e32 v3, v5, v8, vcc_lo
 ; GFX11-NEXT:    v_perm_b32 v0, v3, v0, 0x7060302
 ; GFX11-NEXT:    global_store_b64 v6, v[0:1], s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.ll b/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.ll
index 4efa1e9353ab3a..698be7b2d1ecf0 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.ll
@@ -47,8 +47,6 @@ define amdgpu_cs void @max_6_vgprs(ptr addrspace(1) %p) "amdgpu-num-vgpr"="6" {
 ; CHECK-NEXT:    s_wait_loadcnt 0x0
 ; CHECK-NEXT:    global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
 ; CHECK-NEXT:    s_wait_storecnt 0x0
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; CHECK-NEXT:    s_endpgm
   %tid = load volatile i32, ptr addrspace(1) undef
   %p1 = getelementptr inbounds i32, ptr addrspace(1) %p, i32 %tid
@@ -218,8 +216,6 @@ define amdgpu_cs void @max_11_vgprs_branch(ptr addrspace(1) %p, i32 %tmp) "amdgp
 ; CHECK-NEXT:    s_wait_loadcnt 0x0
 ; CHECK-NEXT:    global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
 ; CHECK-NEXT:    s_wait_storecnt 0x0
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; CHECK-NEXT:    s_endpgm
 .entry:
   %tid = load volatile i32, ptr addrspace(1) undef
diff --git a/llvm/test/CodeGen/AMDGPU/wait-before-stores-with-scope_sys.ll b/llvm/test/CodeGen/AMDGPU/wait-before-stores-with-scope_sys.ll
index 96fa2a45a2ddf0..8f2ade7ac1a0e2 100644
--- a/llvm/test/CodeGen/AMDGPU/wait-before-stores-with-scope_sys.ll
+++ b/llvm/test/CodeGen/AMDGPU/wait-before-stores-with-scope_sys.ll
@@ -6,8 +6,6 @@ define amdgpu_ps void @intrinsic_store_system_scope(i32 %val, <4 x i32> inreg %r
 ; GFX12-LABEL: intrinsic_store_system_scope:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    buffer_store_b32 v0, v[1:2], s[0:3], s4 idxen offen scope:SCOPE_SYS
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   call void @llvm.amdgcn.struct.buffer.store.i32(i32 %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 24)
   ret void
@@ -18,8 +16,6 @@ define amdgpu_ps void @generic_store_volatile(i32 %val, ptr addrspace(1) %out) {
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    global_store_b32 v[1:2], v0, off scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   store volatile i32 %val, ptr addrspace(1) %out
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
index 7a1f05f56a7517..1e737680313cc0 100644
--- a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
+++ b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
@@ -46,8 +46,6 @@ define amdgpu_kernel void @widen_i16_constant_load(ptr addrspace(4) %arg) {
 ; GFX11-NEXT:    s_or_b32 s0, s0, 4
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
 ; GFX11-NEXT:    global_store_b16 v[0:1], v2, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %load = load i16, ptr addrspace(4) %arg, align 4
   %add = add i16 %load, 999
@@ -103,8 +101,6 @@ define amdgpu_kernel void @widen_i16_constant_load_zext_i32(ptr addrspace(4) %ar
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
 ; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %load = load i16, ptr addrspace(4) %arg, align 4
   %ext = zext i16 %load to i32
@@ -161,8 +157,6 @@ define amdgpu_kernel void @widen_i16_constant_load_sext_i32(ptr addrspace(4) %ar
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
 ; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %load = load i16, ptr addrspace(4) %arg, align 4
   %ext = sext i16 %load to i32
@@ -234,8 +228,6 @@ define amdgpu_kernel void @widen_i17_constant_load(ptr addrspace(4) %arg) {
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b16 v[0:1], v4, off
 ; GFX11-NEXT:    global_store_d16_hi_b8 v[2:3], v5, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %load = load i17, ptr addrspace(4) %arg, align 4
   %add = add i17 %load, 34
@@ -283,8 +275,6 @@ define amdgpu_kernel void @widen_f16_constant_load(ptr addrspace(4) %arg) {
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_add_f16_e64 v2, s0, 4.0
 ; GFX11-NEXT:    global_store_b16 v[0:1], v2, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %load = load half, ptr addrspace(4) %arg, align 4
   %add = fadd half %load, 4.0
@@ -353,8 +343,6 @@ define amdgpu_kernel void @widen_v2i8_constant_load(ptr addrspace(4) %arg) {
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
 ; GFX11-NEXT:    global_store_b16 v[0:1], v2, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %load = load <2 x i8>, ptr addrspace(4) %arg, align 4
   %add = add <2 x i8> %load, <i8 12, i8 44>
@@ -415,8 +403,6 @@ define amdgpu_kernel void @no_widen_i16_constant_divergent_load(ptr addrspace(4)
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-NEXT:    v_or_b32_e32 v2, 4, v2
 ; GFX11-NEXT:    global_store_b16 v[0:1], v2, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = zext i32 %tid to i64
@@ -468,8 +454,6 @@ define amdgpu_kernel void @widen_i1_constant_load(ptr addrspace(4) %arg) {
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
 ; GFX11-NEXT:    global_store_b8 v[0:1], v2, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %load = load i1, ptr addrspace(4) %arg, align 4
   %and = and i1 %load, true
@@ -524,8 +508,6 @@ define amdgpu_kernel void @widen_i16_zextload_i64_constant_load(ptr addrspace(4)
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
 ; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %load = load i16, ptr addrspace(4) %arg, align 4
   %zext = zext i16 %load to i32
@@ -583,8 +565,6 @@ define amdgpu_kernel void @widen_i1_zext_to_i64_constant_load(ptr addrspace(4) %
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s1
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
 ; GFX11-NEXT:    global_store_b64 v[0:1], v[2:3], off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %load = load i1, ptr addrspace(4) %arg, align 4
   %zext = zext i1 %load to i64
@@ -638,8 +618,6 @@ define amdgpu_kernel void @widen_i16_constant32_load(ptr addrspace(6) %arg) {
 ; GFX11-NEXT:    s_or_b32 s0, s0, 4
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
 ; GFX11-NEXT:    global_store_b16 v[0:1], v2, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %load = load i16, ptr addrspace(6) %arg, align 4
   %add = add i16 %load, 999
@@ -691,8 +669,6 @@ define amdgpu_kernel void @widen_i16_global_invariant_load(ptr addrspace(1) %arg
 ; GFX11-NEXT:    s_or_b32 s0, s0, 1
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
 ; GFX11-NEXT:    global_store_b16 v[0:1], v2, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %load = load i16, ptr addrspace(1) %arg, align 4, !invariant.load !0
   %add = add i16 %load, 999
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll
index cb3d76cd9c0ba1..48ab512a7ae0df 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll
@@ -8,8 +8,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<8 x half> %A, <8 x half>
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[16:17], v[8:11], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.A = fneg <8 x half> %A
@@ -25,8 +23,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<8 x half> %A, <8 x half>
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[16:17], v[8:11], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.B = fneg <8 x half> %B
@@ -42,8 +38,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<8 x half> %A, <8 x half>
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[16:17], v[8:11], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.C = fneg <8 x float> %C
@@ -59,8 +53,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<8 x half> %A, <8 x half>
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[16:17], v[8:11], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
@@ -76,8 +68,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<8 x i16> %A, <8 x i16>
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[16:17], v[8:11], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.C = fneg <8 x float> %C
@@ -93,8 +83,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<8 x i16> %A, <8 x i16>
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[16:17], v[8:11], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
@@ -108,8 +96,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<8 x half> %A, <8 x half>
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.A = fneg <8 x half> %A
@@ -123,8 +109,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<8 x half> %A, <8 x half>
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] neg_hi:[0,1,0]
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.B = fneg <8 x half> %B
@@ -138,8 +122,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<8 x half> %A, <8 x half>
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1]
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.C = fneg <8 x half> %C
@@ -153,8 +135,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<8 x half> %A, <8 x half>
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1]
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fabs.C = call <8 x half> @llvm.fabs.v8f16(<8 x half> %C)
@@ -170,8 +150,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(<2 x i32> %A, <2 x i3
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.C = fneg <8 x float> %C
@@ -187,8 +165,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(<2 x i32> %A, <2 x i3
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
@@ -204,8 +180,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(<2 x i32> %A, <2 x i3
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.C = fneg <8 x float> %C
@@ -221,8 +195,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(<2 x i32> %A, <2 x i3
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
@@ -238,8 +210,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(<2 x i32> %A, <2 x i3
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.C = fneg <8 x float> %C
@@ -255,8 +225,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(<2 x i32> %A, <2 x i3
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
@@ -272,8 +240,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(<2 x i32> %A, <2 x i3
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.C = fneg <8 x float> %C
@@ -289,8 +255,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(<2 x i32> %A, <2 x i3
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
@@ -306,8 +270,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<8 x half> %A, <16 x ha
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[21:22], v[16:19], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[21:22], v[12:15], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.A = fneg <8 x half> %A
@@ -323,8 +285,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<8 x half> %A, <16 x ha
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[21:22], v[16:19], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[21:22], v[12:15], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.B = fneg <16 x half> %B
@@ -338,8 +298,6 @@ define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<8 x half> %A, <16 x ha
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GFX12-NEXT:    global_store_b128 v[17:18], v[12:15], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.A = fneg <8 x half> %A
@@ -353,8 +311,6 @@ define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<8 x half> %A, <16 x ha
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] neg_hi:[0,1,0]
 ; GFX12-NEXT:    global_store_b128 v[17:18], v[12:15], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.B = fneg <16 x half> %B
@@ -372,8 +328,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<8 x half> %A, <8 x ha
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[16:17], v[8:11], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
@@ -388,8 +342,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<8 x half> %A, <8 x ha
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fabs.C = call <8 x half> @llvm.fabs.v8f16(<8 x half> %C)
@@ -408,8 +360,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<8 x half> %
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[16:17], v[8:11], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %el3 = extractelement <8 x float> %C, i32 3
@@ -430,8 +380,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<8 x half> %A,
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[14:17], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[10:13], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.A = fneg <8 x half> %A
@@ -445,8 +393,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<8 x half> %A,
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0]
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[10:13], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.B = fneg <8 x half> %B
@@ -472,8 +418,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<8 x half> %A, <8 x
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[12:15], v[0:3], v[4:7], v[12:15] neg_lo:[0,0,1]
 ; GFX12-NEXT:    global_store_b128 v[10:11], v[12:15], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %C = load <16 x half>, ptr %Caddr
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-imm.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-imm.ll
index a3973970d50d9a..c208290fb35200 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-imm.ll
@@ -8,8 +8,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm(<8 x half> %A, <8 x half>
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[14:17], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[10:13], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> %A, <8 x half> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
@@ -30,8 +28,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm_non_inlineable(<8 x half>
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[14:17], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[10:13], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> %A, <8 x half> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
@@ -46,8 +42,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm(<8 x i16> %A, <8 x i16> %
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[14:17], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[10:13], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
@@ -68,8 +62,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm_non_inlineable(<8 x i16>
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[14:17], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[10:13], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
@@ -82,8 +74,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm(<8 x half> %A, <8 x half>
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[10:13], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> <half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
@@ -100,8 +90,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm_non_inlineable(<8 x half>
 ; GFX12-NEXT:    v_mov_b32_e32 v13, v10
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], v[10:13]
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[10:13], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> <half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0>, i1 0)
@@ -118,8 +106,6 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm(<8 x i16> %A, <8 x i16>
 ; GFX12-NEXT:    v_mov_b32_e32 v13, v10
 ; GFX12-NEXT:    v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13]
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[10:13], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> <i16 16256, i16 16256, i16 16256, i16 16256, i16 16256, i16 16256, i16 16256, i16 16256>, i1 0)
@@ -136,8 +122,6 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm_non_inlineable(<8 x i16>
 ; GFX12-NEXT:    v_mov_b32_e32 v13, v10
 ; GFX12-NEXT:    v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13]
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[10:13], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> <i16 16320, i16 16320, i16 16320, i16 16320, i16 16320, i16 16320, i16 16320, i16 16320>, i1 0)
@@ -152,8 +136,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm(<2 x i32> %A, <2 x i32> %B
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i1 0)
@@ -174,8 +156,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm_non_inlineable(<2 x i32> %
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128>, i1 0)
@@ -190,8 +170,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm(i32 %A, i32 %B, ptr addrsp
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[2:3], v[8:11], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i1 0)
@@ -212,8 +190,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm_non_inlineable(i32 %A, i32
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[2:3], v[8:11], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> <i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128>, i1 0)
@@ -228,8 +204,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm(<2 x i32> %A, <2 x i32
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
@@ -250,8 +224,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable(<2 x i3
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
@@ -266,8 +238,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm(<2 x i32> %A, <2 x i32
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
@@ -288,8 +258,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable(<2 x i3
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
@@ -304,8 +272,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm(<2 x i32> %A, <2 x i32
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
@@ -326,8 +292,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable(<2 x i3
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
@@ -342,8 +306,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm(<2 x i32> %A, <2 x i32
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
@@ -364,8 +326,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable(<2 x i3
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
@@ -380,8 +340,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm(<2 x i32> %A, <2 x i32> %B
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i1 0)
@@ -402,8 +360,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm_non_inlineable(<2 x i32> %
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128>, i1 0)
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-iu-modifiers.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-iu-modifiers.ll
index dbb4db05a35c5d..d99ed8a42ee197 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-iu-modifiers.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-iu-modifiers.ll
@@ -8,8 +8,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src0(<2 x i32> %A, <2 x i
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
@@ -24,8 +22,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src1(<2 x i32> %A, <2 x i
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
@@ -40,8 +36,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_clamp(<2 x i32> %A, <2 x i32>
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
@@ -58,8 +52,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src0(i32 %A, i32 %B, <8 x
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[10:11], v[6:9], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[10:11], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32(i1 1, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 0)
@@ -74,8 +66,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src1(i32 %A, i32 %B, <8 x
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[10:11], v[6:9], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[10:11], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32(i1 0, i32 %A, i1 1, i32 %B, <8 x i32> %C, i1 0)
@@ -90,8 +80,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_clamp(i32 %A, i32 %B, <8 x i32
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[10:11], v[6:9], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[10:11], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 1)
@@ -108,8 +96,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src0(<2 x i32> %A, <2 x i
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
@@ -124,8 +110,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src1(<2 x i32> %A, <2 x i
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
@@ -140,8 +124,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_clamp(<2 x i32> %A, <2 x i32>
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
@@ -160,8 +142,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src0(<2 x i32> %A, <4 x
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.i16(i1 1, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
@@ -176,8 +156,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src1(<2 x i32> %A, <4 x
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.i16(i1 0, <2 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
@@ -192,8 +170,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_clamp(<2 x i32> %A, <4 x i32
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 1)
@@ -210,8 +186,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src0(i32 %A, <2 x i32>
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[7:10], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[3:6], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.i16(i1 1, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
@@ -226,8 +200,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src1(i32 %A, <2 x i32>
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[7:10], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[3:6], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.i16(i1 0, i32 %A, i1 1, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
@@ -242,8 +214,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_clamp(i32 %A, <2 x i32> %B,
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[7:10], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[3:6], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 1)
@@ -260,8 +230,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src0(<2 x i32> %A, <4 x
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.i32(i1 1, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0)
@@ -276,8 +244,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src1(<2 x i32> %A, <4 x
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.i32(i1 0, <2 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0)
@@ -292,8 +258,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_clamp(<2 x i32> %A, <4 x i32
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.i32(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 1)
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-swmmac-index_key.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-swmmac-index_key.ll
index 009288dbdf530a..d10dfcaeba7cc2 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-swmmac-index_key.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-swmmac-index_key.ll
@@ -19,8 +19,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<8 x half> %A, <16
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[24:25], v[16:19], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[24:25], v[12:15], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
@@ -51,8 +49,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16_index_key(<8 x i16> %A, <16
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[24:25], v[16:19], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[24:25], v[12:15], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
@@ -77,8 +73,6 @@ define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_index_key(<8 x half> %A, <16
 ; GFX12-NEXT:    v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:1
 ; GFX12-NEXT:    global_store_b128 v[18:19], v[22:25], off
 ; GFX12-NEXT:    global_store_b128 v[20:21], v[12:15], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
@@ -103,8 +97,6 @@ define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16_index_key(<8 x i16> %A, <1
 ; GFX12-NEXT:    v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:1
 ; GFX12-NEXT:    global_store_b128 v[18:19], v[22:25], off
 ; GFX12-NEXT:    global_store_b128 v[20:21], v[12:15], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
@@ -135,8 +127,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_index_key(<2 x i32> %A, <4 x
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[18:19], v[10:13], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[18:19], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
@@ -167,8 +157,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_index_key(i32 %A, <2 x i32>
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[7:10], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[3:6], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
@@ -199,8 +187,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8_index_key(<2 x i32> %A,
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[18:19], v[10:13], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[18:19], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
@@ -231,8 +217,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8_index_key(<2 x i32> %A,
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[18:19], v[10:13], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[18:19], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
@@ -263,8 +247,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8_index_key(<2 x i32> %A,
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[18:19], v[10:13], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[18:19], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
@@ -295,8 +277,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8_index_key(<2 x i32> %A,
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[18:19], v[10:13], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[18:19], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32.ll
index 1012287838f120..6174841eb27cbd 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32.ll
@@ -8,8 +8,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x half> %A, <8 x half> %B,
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[16:17], v[8:11], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> %A, <8 x half> %B, <8 x float> %C)
@@ -24,8 +22,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i16> %A, <8 x i16> %B, <
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[16:17], v[8:11], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x float> %C)
@@ -38,8 +34,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16(<8 x half> %A, <8 x half> %B,
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11]
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %C, i1 0)
@@ -52,8 +46,6 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16(<8 x i16> %A, <8 x i16> %B,
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11]
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i1 0)
@@ -68,8 +60,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu8(<2 x i32> %A, <2 x i32> %B, <8
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
@@ -84,8 +74,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu4(i32 %A, i32 %B, <8 x i32> %C,
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[10:11], v[6:9], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[10:11], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 0)
@@ -100,8 +88,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8(<2 x i32> %A, <2 x i32> %B
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C)
@@ -116,8 +102,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8(<2 x i32> %A, <2 x i32> %B
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C)
@@ -132,8 +116,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8(<2 x i32> %A, <2 x i32> %B
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C)
@@ -148,8 +130,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8(<2 x i32> %A, <2 x i32> %B
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C)
@@ -164,8 +144,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x32_iu4(<2 x i32> %A, <2 x i32> %B, <8
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
@@ -181,8 +159,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_f16(<8 x half> %A, <16 x half> %
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[21:22], v[16:19], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[21:22], v[12:15], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.i16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index)
@@ -197,8 +173,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16(<8 x i16> %A, <16 x i16> %B
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[21:22], v[16:19], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[21:22], v[12:15], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index)
@@ -211,8 +185,6 @@ define amdgpu_ps void @test_swmmac_f16_16x16x32_f16(<8 x half> %A, <16 x half> %
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16
 ; GFX12-NEXT:    global_store_b128 v[17:18], v[12:15], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index)
@@ -225,8 +197,6 @@ define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16(<8 x i16> %A, <16 x i16> %
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16
 ; GFX12-NEXT:    global_store_b128 v[17:18], v[12:15], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index)
@@ -241,8 +211,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8(<2 x i32> %A, <4 x i32> %B,
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
@@ -257,8 +225,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4(i32 %A, <2 x i32> %B, <8 x i
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[7:10], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[12:13], v[3:6], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
@@ -273,8 +239,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4(<2 x i32> %A, <4 x i32> %B,
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.i32(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0)
@@ -289,8 +253,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8(<2 x i32> %A, <4 x i32>
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
@@ -305,8 +267,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8(<2 x i32> %A, <4 x i32>
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
@@ -321,8 +281,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8(<2 x i32> %A, <4 x i32>
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
@@ -337,8 +295,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8(<2 x i32> %A, <4 x i32>
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll
index 645f4981ba3179..8f7cd5cb2bb656 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll
@@ -6,8 +6,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<4 x half> %A, <4 x half>
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.A = fneg <4 x half> %A
@@ -21,8 +19,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<4 x half> %A, <4 x half>
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] neg_hi:[0,1,0]
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.B = fneg <4 x half> %B
@@ -36,8 +32,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<4 x half> %A, <4 x half>
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.C = fneg <4 x float> %C
@@ -51,8 +45,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<4 x half> %A, <4 x half>
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1]
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
@@ -66,8 +58,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<4 x i16> %A, <4 x i16>
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.C = fneg <4 x float> %C
@@ -81,8 +71,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<4 x i16> %A, <4 x i16>
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1]
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
@@ -96,8 +84,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<4 x half> %A, <4 x half>
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GFX12-NEXT:    global_store_b64 v[6:7], v[4:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.A = fneg <4 x half> %A
@@ -111,8 +97,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<4 x half> %A, <4 x half>
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] neg_hi:[0,1,0]
 ; GFX12-NEXT:    global_store_b64 v[6:7], v[4:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.B = fneg <4 x half> %B
@@ -126,8 +110,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<4 x half> %A, <4 x half>
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1]
 ; GFX12-NEXT:    global_store_b64 v[6:7], v[4:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.C = fneg <4 x half> %C
@@ -141,8 +123,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<4 x half> %A, <4 x half>
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1]
 ; GFX12-NEXT:    global_store_b64 v[6:7], v[4:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fabs.C = call <4 x half> @llvm.fabs.v4f16(<4 x half> %C)
@@ -156,8 +136,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(i32 %A, i32 %B, <4 x
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.C = fneg <4 x float> %C
@@ -171,8 +149,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(i32 %A, i32 %B, <4 x
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
@@ -186,8 +162,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(i32 %A, i32 %B, <4 x
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.C = fneg <4 x float> %C
@@ -201,8 +175,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(i32 %A, i32 %B, <4 x
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
@@ -216,8 +188,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(i32 %A, i32 %B, <4 x
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.C = fneg <4 x float> %C
@@ -231,8 +201,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(i32 %A, i32 %B, <4 x
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
@@ -246,8 +214,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(i32 %A, i32 %B, <4 x
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.C = fneg <4 x float> %C
@@ -261,8 +227,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(i32 %A, i32 %B, <4 x
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
@@ -276,8 +240,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<4 x half> %A, <8 x hal
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GFX12-NEXT:    global_store_b128 v[11:12], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.A = fneg <4 x half> %A
@@ -291,8 +253,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<4 x half> %A, <8 x hal
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] neg_hi:[0,1,0]
 ; GFX12-NEXT:    global_store_b128 v[11:12], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.B = fneg <8 x half> %B
@@ -306,8 +266,6 @@ define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<4 x half> %A, <8 x hal
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GFX12-NEXT:    global_store_b64 v[9:10], v[6:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.A = fneg <4 x half> %A
@@ -321,8 +279,6 @@ define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<4 x half> %A, <8 x hal
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] neg_hi:[0,1,0]
 ; GFX12-NEXT:    global_store_b64 v[9:10], v[6:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.B = fneg <8 x half> %B
@@ -338,8 +294,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<4 x half> %A, <4 x ha
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
@@ -354,8 +308,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<4 x half> %A, <4 x ha
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GFX12-NEXT:    global_store_b64 v[6:7], v[4:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fabs.C = call <4 x half> @llvm.fabs.v4f16(<4 x half> %C)
@@ -372,8 +324,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<4 x half> %
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %el3 = extractelement <4 x float> %C, i32 3
@@ -392,8 +342,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<4 x half> %A,
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.A = fneg <4 x half> %A
@@ -407,8 +355,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<4 x half> %A,
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0]
 ; GFX12-NEXT:    global_store_b64 v[4:5], v[6:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %fneg.B = fneg <4 x half> %B
@@ -429,8 +375,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<4 x half> %A, <4 x
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1]
 ; GFX12-NEXT:    global_store_b64 v[6:7], v[4:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %C = load <8 x half>, ptr %Caddr
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-imm.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-imm.ll
index ad1d66e448b24a..5b01b174144da8 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-imm.ll
@@ -6,8 +6,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm(<4 x half> %A, <4 x half>
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float>@llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> %A, <4 x half> %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
@@ -25,8 +23,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm_non_inlineable(<4 x half>
 ; GFX12-NEXT:    v_mov_b32_e32 v9, v6
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], v[6:9]
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float>@llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> %A, <4 x half> %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
@@ -39,8 +35,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm(<4 x i16> %A, <4 x i16> %
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf16 v[6:9], v[0:1], v[2:3], 1.0
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
@@ -58,8 +52,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm_non_inlineable(<4 x i16>
 ; GFX12-NEXT:    v_mov_b32_e32 v9, v6
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf16 v[6:9], v[0:1], v[2:3], v[6:9]
 ; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
@@ -72,8 +64,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm(<4 x half> %A, <4 x half>
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0
 ; GFX12-NEXT:    global_store_b64 v[4:5], v[6:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
@@ -89,8 +79,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm_non_inlineable(<4 x half>
 ; GFX12-NEXT:    v_mov_b32_e32 v7, v6
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], v[6:7]
 ; GFX12-NEXT:    global_store_b64 v[4:5], v[6:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> <half 3.0, half 3.0, half 3.0, half 3.0>, i1 0)
@@ -106,8 +94,6 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm(<4 x i16> %A, <4 x i16>
 ; GFX12-NEXT:    v_mov_b32_e32 v7, v6
 ; GFX12-NEXT:    v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], v[6:7]
 ; GFX12-NEXT:    global_store_b64 v[4:5], v[6:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> <i16 16256, i16 16256, i16 16256, i16 16256>, i1 0)
@@ -123,8 +109,6 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm_non_inlineable(<4 x i16>
 ; GFX12-NEXT:    v_mov_b32_e32 v7, v6
 ; GFX12-NEXT:    v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], v[6:7]
 ; GFX12-NEXT:    global_store_b64 v[4:5], v[6:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> <i16 16320, i16 16320, i16 16320, i16 16320>, i1 0)
@@ -137,8 +121,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm(i32 %A, i32 %B, ptr addrsp
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_i32_16x16x16_iu8 v[4:7], v0, v1, 1
 ; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, i1 0)
@@ -156,8 +138,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm_non_inlineable(i32 %A, i32
 ; GFX12-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX12-NEXT:    v_wmma_i32_16x16x16_iu8 v[4:7], v0, v1, v[4:7]
 ; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 128, i32 128, i32 128, i32 128>, i1 0)
@@ -170,8 +150,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm(i32 %A, i32 %B, ptr addrsp
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:7], v0, v1, 1
 ; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, i1 0)
@@ -189,8 +167,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm_non_inlineable(i32 %A, i32
 ; GFX12-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX12-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:7], v0, v1, v[4:7]
 ; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 128, i32 128, i32 128, i32 128>, i1 0)
@@ -203,8 +179,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm(i32 %A, i32 %B, <4 x f
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[4:7], v0, v1, 1.0
 ; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
@@ -222,8 +196,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable(i32 %A,
 ; GFX12-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[4:7], v0, v1, v[4:7]
 ; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
@@ -236,8 +208,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm(i32 %A, i32 %B, <4 x f
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[4:7], v0, v1, 1.0
 ; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
@@ -255,8 +225,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable(i32 %A,
 ; GFX12-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[4:7], v0, v1, v[4:7]
 ; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
@@ -269,8 +237,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm(i32 %A, i32 %B, <4 x f
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[4:7], v0, v1, 1.0
 ; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
@@ -288,8 +254,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable(i32 %A,
 ; GFX12-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[4:7], v0, v1, v[4:7]
 ; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
@@ -302,8 +266,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm(i32 %A, i32 %B, <4 x f
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[4:7], v0, v1, 1.0
 ; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
@@ -321,8 +283,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable(i32 %A,
 ; GFX12-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[4:7], v0, v1, v[4:7]
 ; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
@@ -335,8 +295,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm(i32 %A, i32 %B, <4 x i32>
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_i32_16x16x32_iu4 v[4:7], v0, v1, 1
 ; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, i1 0)
@@ -354,8 +312,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm_non_inlineable(i32 %A, i32
 ; GFX12-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX12-NEXT:    v_wmma_i32_16x16x32_iu4 v[4:7], v0, v1, v[4:7]
 ; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 128, i32 128, i32 128, i32 128>, i1 0)
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-iu-modifiers.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-iu-modifiers.ll
index c52b079aec2f76..616fa39277371b 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-iu-modifiers.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-iu-modifiers.ll
@@ -6,8 +6,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src0(i32 %A, i32 %B, <4 x
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
@@ -20,8 +18,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src1(i32 %A, i32 %B, <4 x
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0)
@@ -34,8 +30,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_clamp(i32 %A, i32 %B, <4 x i32
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] clamp
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1)
@@ -50,8 +44,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src0(i32 %A, i32 %B, <4 x
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
@@ -64,8 +56,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src1(i32 %A, i32 %B, <4 x
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0)
@@ -78,8 +68,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_clamp(i32 %A, i32 %B, <4 x i32
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] clamp
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1)
@@ -94,8 +82,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src0(i32 %A, i32 %B, <4 x
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
@@ -108,8 +94,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src1(i32 %A, i32 %B, <4 x
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0)
@@ -122,8 +106,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_clamp(i32 %A, i32 %B, <4 x i32
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] clamp
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1)
@@ -136,8 +118,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src0(i32 %A, <2 x i32>
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0]
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.i8(i1 1, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0)
@@ -150,8 +130,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src1(i32 %A, <2 x i32>
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0]
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.i8(i1 0, i32 %A, i1 1, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0)
@@ -164,8 +142,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_clamp(i32 %A, <2 x i32> %B,
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 clamp
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 1)
@@ -180,8 +156,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src0(i32 %A, i32 %B, <4
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[1,0,0]
 ; GFX12-NEXT:    global_store_b128 v[7:8], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.i16(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 0)
@@ -194,8 +168,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src1(i32 %A, i32 %B, <4
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[0,1,0]
 ; GFX12-NEXT:    global_store_b128 v[7:8], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.i16(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i16 %Index, i1 0)
@@ -208,8 +180,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_clamp(i32 %A, i32 %B, <4 x i
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 clamp
 ; GFX12-NEXT:    global_store_b128 v[7:8], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 1)
@@ -224,8 +194,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src0(i32 %A, <2 x i32>
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0]
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.i16(i1 1, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0)
@@ -238,8 +206,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src1(i32 %A, <2 x i32>
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0]
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.i16(i1 0, i32 %A, i1 1, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0)
@@ -252,8 +218,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_clamp(i32 %A, <2 x i32> %B,
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 clamp
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 1)
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll
index ebbb3d1fe7dea1..311e76b9bb2b0c 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll
@@ -27,8 +27,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<4 x half> %A, <8
 ; GFX12-NEXT:    global_store_b128 v[14:15], v[24:27], off
 ; GFX12-NEXT:    global_store_b128 v[16:17], v[28:31], off
 ; GFX12-NEXT:    global_store_b128 v[18:19], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
@@ -73,8 +71,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16_index_key(<4 x i16> %A, <8
 ; GFX12-NEXT:    global_store_b128 v[14:15], v[24:27], off
 ; GFX12-NEXT:    global_store_b128 v[16:17], v[28:31], off
 ; GFX12-NEXT:    global_store_b128 v[18:19], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
@@ -113,8 +109,6 @@ define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_index_key(<4 x half> %A, <8
 ; GFX12-NEXT:    global_store_b64 v[12:13], v[18:19], off
 ; GFX12-NEXT:    global_store_b64 v[14:15], v[20:21], off
 ; GFX12-NEXT:    global_store_b64 v[16:17], v[6:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
@@ -153,8 +147,6 @@ define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16_index_key(<4 x i16> %A, <8
 ; GFX12-NEXT:    global_store_b64 v[12:13], v[18:19], off
 ; GFX12-NEXT:    global_store_b64 v[14:15], v[20:21], off
 ; GFX12-NEXT:    global_store_b64 v[16:17], v[6:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
@@ -199,8 +191,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_index_key(i32 %A, <2 x i32>
 ; GFX12-NEXT:    global_store_b128 v[11:12], v[21:24], off
 ; GFX12-NEXT:    global_store_b128 v[13:14], v[25:28], off
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[3:6], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
@@ -233,8 +223,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_index_key(i32 %A, i32 %B, <4
 ; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:1
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[12:15], off
 ; GFX12-NEXT:    global_store_b128 v[10:11], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
@@ -261,8 +249,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_index_key(i32 %A, <2 x i32>
 ; GFX12-NEXT:    v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:1
 ; GFX12-NEXT:    global_store_b128 v[9:10], v[13:16], off
 ; GFX12-NEXT:    global_store_b128 v[11:12], v[3:6], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
@@ -301,8 +287,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8_index_key(i32 %A, <2 x i
 ; GFX12-NEXT:    global_store_b128 v[11:12], v[21:24], off
 ; GFX12-NEXT:    global_store_b128 v[13:14], v[25:28], off
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[3:6], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
@@ -347,8 +331,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8_index_key(i32 %A, <2 x i
 ; GFX12-NEXT:    global_store_b128 v[11:12], v[21:24], off
 ; GFX12-NEXT:    global_store_b128 v[13:14], v[25:28], off
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[3:6], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
@@ -393,8 +375,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8_index_key(i32 %A, <2 x i
 ; GFX12-NEXT:    global_store_b128 v[11:12], v[21:24], off
 ; GFX12-NEXT:    global_store_b128 v[13:14], v[25:28], off
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[3:6], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
@@ -439,8 +419,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8_index_key(i32 %A, <2 x i
 ; GFX12-NEXT:    global_store_b128 v[11:12], v[21:24], off
 ; GFX12-NEXT:    global_store_b128 v[13:14], v[25:28], off
 ; GFX12-NEXT:    global_store_b128 v[15:16], v[3:6], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64.ll
index 0f8df5a6e3d2d6..901405cbd0a1b0 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64.ll
@@ -6,8 +6,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<4 x half> %A, <4 x half> %B,
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7]
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float>@llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> %A, <4 x half> %B, <4 x float> %C)
@@ -20,8 +18,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<4 x i16> %A, <4 x i16> %B, <
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7]
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[4:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x float> %C)
@@ -34,8 +30,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16(<4 x half> %A, <4 x half> %B,
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5]
 ; GFX12-NEXT:    global_store_b64 v[6:7], v[4:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> %C, i1 0)
@@ -48,8 +42,6 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16(<4 x i16> %A, <4 x i16> %B,
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5]
 ; GFX12-NEXT:    global_store_b64 v[6:7], v[4:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, i1 0)
@@ -62,8 +54,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu8(i32 %A, i32 %B, <4 x i32> %C,
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5]
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
@@ -76,8 +66,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu4(i32 %A, i32 %B, <4 x i32> %C,
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5]
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
@@ -90,8 +78,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8(i32 %A, i32 %B, <4 x float
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5]
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> %C)
@@ -104,8 +90,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8(i32 %A, i32 %B, <4 x float
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5]
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> %C)
@@ -118,8 +102,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8(i32 %A, i32 %B, <4 x float
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5]
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> %C)
@@ -132,8 +114,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8(i32 %A, i32 %B, <4 x float
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5]
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> %C)
@@ -146,8 +126,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x32_iu4(i32 %A, i32 %B, <4 x i32> %C,
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5]
 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
@@ -160,8 +138,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_f16(<4 x half> %A, <8 x half> %B
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10
 ; GFX12-NEXT:    global_store_b128 v[11:12], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index)
@@ -174,8 +150,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16(<4 x i16> %A, <8 x i16> %B,
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10
 ; GFX12-NEXT:    global_store_b128 v[11:12], v[6:9], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index)
@@ -188,8 +162,6 @@ define amdgpu_ps void @test_swmmac_f16_16x16x32_f16(<4 x half> %A, <8 x half> %B
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8
 ; GFX12-NEXT:    global_store_b64 v[9:10], v[6:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index)
@@ -202,8 +174,6 @@ define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16(<4 x i16> %A, <8 x i16> %B
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8
 ; GFX12-NEXT:    global_store_b64 v[9:10], v[6:7], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index)
@@ -216,8 +186,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8(i32 %A, <2 x i32> %B, <4 x i
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0)
@@ -230,8 +198,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4(i32 %A, i32 %B, <4 x i32> %C
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6
 ; GFX12-NEXT:    global_store_b128 v[7:8], v[2:5], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 0)
@@ -244,8 +210,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4(i32 %A, <2 x i32> %B, <4 x i
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0)
@@ -258,8 +222,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8(i32 %A, <2 x i32> %B, <4
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index)
@@ -272,8 +234,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8(i32 %A, <2 x i32> %B, <4
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index)
@@ -286,8 +246,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8(i32 %A, <2 x i32> %B, <4
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index)
@@ -300,8 +258,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8(i32 %A, <2 x i32> %B, <4
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7
 ; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index)
diff --git a/llvm/test/CodeGen/AMDGPU/wmma_multiple_32.ll b/llvm/test/CodeGen/AMDGPU/wmma_multiple_32.ll
index ac218a2b19c67c..b7b6028c86dcae 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma_multiple_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma_multiple_32.ll
@@ -31,8 +31,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<16 x half> %A, <16 x half> %B
 ; W32-NEXT:    s_clause 0x1
 ; W32-NEXT:    global_store_b128 v[26:27], v[20:23], off offset:16
 ; W32-NEXT:    global_store_b128 v[26:27], v[16:19], off
-; W32-NEXT:    s_nop 0
-; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v16f16(<16 x half> %A, <16 x half> %B, <8 x float> %C)
@@ -55,8 +53,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<16 x i16> %A, <16 x i16> %B,
 ; W32-NEXT:    s_clause 0x1
 ; W32-NEXT:    global_store_b128 v[26:27], v[20:23], off offset:16
 ; W32-NEXT:    global_store_b128 v[26:27], v[16:19], off
-; W32-NEXT:    s_nop 0
-; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v16i16(<16 x i16> %A, <16 x i16> %B, <8 x float> %C)
@@ -79,8 +75,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<16 x half> %A, <16 x half>
 ; W32-NEXT:    s_clause 0x1
 ; W32-NEXT:    global_store_b128 v[26:27], v[20:23], off offset:16
 ; W32-NEXT:    global_store_b128 v[26:27], v[16:19], off
-; W32-NEXT:    s_nop 0
-; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
   %res = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, i1 0)
@@ -101,8 +95,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<16 x half> %A, <16 x half>
 ; W32-NEXT:    s_clause 0x1
 ; W32-NEXT:    global_store_b128 v[26:27], v[20:23], off offset:16
 ; W32-NEXT:    global_store_b128 v[26:27], v[16:19], off
-; W32-NEXT:    s_nop 0
-; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
   %res = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, i1 1)
@@ -125,8 +117,6 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<16 x i16> %A, <16 x i16>
 ; W32-NEXT:    s_clause 0x1
 ; W32-NEXT:    global_store_b128 v[26:27], v[20:23], off offset:16
 ; W32-NEXT:    global_store_b128 v[26:27], v[16:19], off
-; W32-NEXT:    s_nop 0
-; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
   %res = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16.v16i16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, i1 0)
@@ -147,8 +137,6 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<16 x i16> %A, <16 x i16>
 ; W32-NEXT:    s_clause 0x1
 ; W32-NEXT:    global_store_b128 v[26:27], v[20:23], off offset:16
 ; W32-NEXT:    global_store_b128 v[26:27], v[16:19], off
-; W32-NEXT:    s_nop 0
-; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
   %res = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16.v16i16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, i1 1)
@@ -171,8 +159,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A
 ; W32-NEXT:    s_clause 0x1
 ; W32-NEXT:    global_store_b128 v[18:19], v[12:15], off offset:16
 ; W32-NEXT:    global_store_b128 v[18:19], v[8:11], off
-; W32-NEXT:    s_nop 0
-; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v4i32(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 0)
@@ -193,8 +179,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed(<4 x i32> %A,
 ; W32-NEXT:    s_clause 0x1
 ; W32-NEXT:    global_store_b128 v[18:19], v[12:15], off offset:16
 ; W32-NEXT:    global_store_b128 v[18:19], v[8:11], off
-; W32-NEXT:    s_nop 0
-; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v4i32(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 0)
@@ -215,8 +199,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned(<4 x i32> %A,
 ; W32-NEXT:    s_clause 0x1
 ; W32-NEXT:    global_store_b128 v[18:19], v[12:15], off offset:16
 ; W32-NEXT:    global_store_b128 v[18:19], v[8:11], off
-; W32-NEXT:    s_nop 0
-; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v4i32(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 0)
@@ -237,8 +219,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed(<4 x i32> %A, <4
 ; W32-NEXT:    s_clause 0x1
 ; W32-NEXT:    global_store_b128 v[18:19], v[12:15], off offset:16
 ; W32-NEXT:    global_store_b128 v[18:19], v[8:11], off
-; W32-NEXT:    s_nop 0
-; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v4i32(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 0)
@@ -259,8 +239,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp(<4 x i
 ; W32-NEXT:    s_clause 0x1
 ; W32-NEXT:    global_store_b128 v[18:19], v[12:15], off offset:16
 ; W32-NEXT:    global_store_b128 v[18:19], v[8:11], off
-; W32-NEXT:    s_nop 0
-; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v4i32(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 1)
@@ -281,8 +259,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp(<4 x i32
 ; W32-NEXT:    s_clause 0x1
 ; W32-NEXT:    global_store_b128 v[18:19], v[12:15], off offset:16
 ; W32-NEXT:    global_store_b128 v[18:19], v[8:11], off
-; W32-NEXT:    s_nop 0
-; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v4i32(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 1)
@@ -303,8 +279,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp(<4 x i32
 ; W32-NEXT:    s_clause 0x1
 ; W32-NEXT:    global_store_b128 v[18:19], v[12:15], off offset:16
 ; W32-NEXT:    global_store_b128 v[18:19], v[8:11], off
-; W32-NEXT:    s_nop 0
-; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v4i32(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 1)
@@ -325,8 +299,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed_clamp(<4 x i32>
 ; W32-NEXT:    s_clause 0x1
 ; W32-NEXT:    global_store_b128 v[18:19], v[12:15], off offset:16
 ; W32-NEXT:    global_store_b128 v[18:19], v[8:11], off
-; W32-NEXT:    s_nop 0
-; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v4i32(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 1)
@@ -349,8 +321,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned(<2 x i32> %A
 ; W32-NEXT:    s_clause 0x1
 ; W32-NEXT:    global_store_b128 v[14:15], v[8:11], off offset:16
 ; W32-NEXT:    global_store_b128 v[14:15], v[4:7], off
-; W32-NEXT:    s_nop 0
-; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.v2i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
@@ -371,8 +341,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed(<2 x i32> %A,
 ; W32-NEXT:    s_clause 0x1
 ; W32-NEXT:    global_store_b128 v[14:15], v[8:11], off offset:16
 ; W32-NEXT:    global_store_b128 v[14:15], v[4:7], off
-; W32-NEXT:    s_nop 0
-; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.v2i32(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
@@ -393,8 +361,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned(<2 x i32> %A,
 ; W32-NEXT:    s_clause 0x1
 ; W32-NEXT:    global_store_b128 v[14:15], v[8:11], off offset:16
 ; W32-NEXT:    global_store_b128 v[14:15], v[4:7], off
-; W32-NEXT:    s_nop 0
-; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.v2i32(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
@@ -415,8 +381,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed(<2 x i32> %A, <2
 ; W32-NEXT:    s_clause 0x1
 ; W32-NEXT:    global_store_b128 v[14:15], v[8:11], off offset:16
 ; W32-NEXT:    global_store_b128 v[14:15], v[4:7], off
-; W32-NEXT:    s_nop 0
-; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.v2i32(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
@@ -438,8 +402,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp(<2 x i
 ; W32-NEXT:    s_clause 0x1
 ; W32-NEXT:    global_store_b128 v[14:15], v[8:11], off offset:16
 ; W32-NEXT:    global_store_b128 v[14:15], v[4:7], off
-; W32-NEXT:    s_nop 0
-; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.v2i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
@@ -460,8 +422,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp(<2 x i32
 ; W32-NEXT:    s_clause 0x1
 ; W32-NEXT:    global_store_b128 v[14:15], v[8:11], off offset:16
 ; W32-NEXT:    global_store_b128 v[14:15], v[4:7], off
-; W32-NEXT:    s_nop 0
-; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.v2i32(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 1)
@@ -482,8 +442,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp(<2 x i32
 ; W32-NEXT:    s_clause 0x1
 ; W32-NEXT:    global_store_b128 v[14:15], v[8:11], off offset:16
 ; W32-NEXT:    global_store_b128 v[14:15], v[4:7], off
-; W32-NEXT:    s_nop 0
-; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.v2i32(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
@@ -504,8 +462,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed_clamp(<2 x i32>
 ; W32-NEXT:    s_clause 0x1
 ; W32-NEXT:    global_store_b128 v[14:15], v[8:11], off offset:16
 ; W32-NEXT:    global_store_b128 v[14:15], v[4:7], off
-; W32-NEXT:    s_nop 0
-; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.v2i32(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 1)
diff --git a/llvm/test/CodeGen/AMDGPU/wmma_multiple_64.ll b/llvm/test/CodeGen/AMDGPU/wmma_multiple_64.ll
index 9ec2d8f33edaa5..524a25cbc1e6d5 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma_multiple_64.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma_multiple_64.ll
@@ -27,8 +27,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<16 x half> %A, <16 x half> %B
 ; W64-NEXT:    v_wmma_f32_16x16x16_f16 v[16:19], v[8:15], v[8:15], v[16:19]
 ; W64-NEXT:    global_store_b128 v[20:21], v[24:27], off
 ; W64-NEXT:    global_store_b128 v[22:23], v[16:19], off
-; W64-NEXT:    s_nop 0
-; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v16f16(<16 x half> %A, <16 x half> %B, <4 x float> %C)
@@ -47,8 +45,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<16 x i16> %A, <16 x i16> %B,
 ; W64-NEXT:    v_wmma_f32_16x16x16_bf16 v[16:19], v[8:15], v[8:15], v[16:19]
 ; W64-NEXT:    global_store_b128 v[20:21], v[24:27], off
 ; W64-NEXT:    global_store_b128 v[22:23], v[16:19], off
-; W64-NEXT:    s_nop 0
-; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v16i16(<16 x i16> %A, <16 x i16> %B, <4 x float> %C)
@@ -67,8 +63,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<16 x half> %A, <16 x half>
 ; W64-NEXT:    v_wmma_f16_16x16x16_f16 v[16:19], v[8:15], v[8:15], v[16:19]
 ; W64-NEXT:    global_store_b128 v[20:21], v[24:27], off
 ; W64-NEXT:    global_store_b128 v[22:23], v[16:19], off
-; W64-NEXT:    s_nop 0
-; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
   %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v16f16(<16 x half> %A, <16 x half> %B, <8 x half> %C, i1 0)
@@ -85,8 +79,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<16 x half> %A, <16 x half>
 ; W64-NEXT:    v_wmma_f16_16x16x16_f16 v[16:19], v[8:15], v[8:15], v[16:19] op_sel:[0,0,1]
 ; W64-NEXT:    global_store_b128 v[20:21], v[24:27], off
 ; W64-NEXT:    global_store_b128 v[22:23], v[16:19], off
-; W64-NEXT:    s_nop 0
-; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
   %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v16f16(<16 x half> %A, <16 x half> %B, <8 x half> %C, i1 1)
@@ -105,8 +97,6 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<16 x i16> %A, <16 x i16>
 ; W64-NEXT:    v_wmma_bf16_16x16x16_bf16 v[16:19], v[8:15], v[8:15], v[16:19]
 ; W64-NEXT:    global_store_b128 v[20:21], v[24:27], off
 ; W64-NEXT:    global_store_b128 v[22:23], v[16:19], off
-; W64-NEXT:    s_nop 0
-; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
   %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v16i16(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, i1 0)
@@ -123,8 +113,6 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<16 x i16> %A, <16 x i16>
 ; W64-NEXT:    v_wmma_bf16_16x16x16_bf16 v[16:19], v[8:15], v[8:15], v[16:19] op_sel:[0,0,1]
 ; W64-NEXT:    global_store_b128 v[20:21], v[24:27], off
 ; W64-NEXT:    global_store_b128 v[22:23], v[16:19], off
-; W64-NEXT:    s_nop 0
-; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
   %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v16i16(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, i1 1)
@@ -143,8 +131,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A
 ; W64-NEXT:    v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11]
 ; W64-NEXT:    global_store_b128 v[12:13], v[16:19], off
 ; W64-NEXT:    global_store_b128 v[14:15], v[8:11], off
-; W64-NEXT:    s_nop 0
-; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 0)
@@ -162,8 +148,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed(<4 x i32> %A,
 ; W64-NEXT:    v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[0,1,0]
 ; W64-NEXT:    global_store_b128 v[12:13], v[16:19], off
 ; W64-NEXT:    global_store_b128 v[14:15], v[8:11], off
-; W64-NEXT:    s_nop 0
-; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 0)
@@ -180,8 +164,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned(<4 x i32> %A,
 ; W64-NEXT:    v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[1,0,0]
 ; W64-NEXT:    global_store_b128 v[12:13], v[16:19], off
 ; W64-NEXT:    global_store_b128 v[14:15], v[8:11], off
-; W64-NEXT:    s_nop 0
-; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 0)
@@ -198,8 +180,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed(<4 x i32> %A, <4
 ; W64-NEXT:    v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[1,1,0]
 ; W64-NEXT:    global_store_b128 v[12:13], v[16:19], off
 ; W64-NEXT:    global_store_b128 v[14:15], v[8:11], off
-; W64-NEXT:    s_nop 0
-; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 0)
@@ -216,8 +196,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp(<4 x i
 ; W64-NEXT:    v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] clamp
 ; W64-NEXT:    global_store_b128 v[12:13], v[16:19], off
 ; W64-NEXT:    global_store_b128 v[14:15], v[8:11], off
-; W64-NEXT:    s_nop 0
-; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 1)
@@ -234,8 +212,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp(<4 x i32
 ; W64-NEXT:    v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[0,1,0] clamp
 ; W64-NEXT:    global_store_b128 v[12:13], v[16:19], off
 ; W64-NEXT:    global_store_b128 v[14:15], v[8:11], off
-; W64-NEXT:    s_nop 0
-; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 1)
@@ -252,8 +228,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp(<4 x i32
 ; W64-NEXT:    v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[1,0,0] clamp
 ; W64-NEXT:    global_store_b128 v[12:13], v[16:19], off
 ; W64-NEXT:    global_store_b128 v[14:15], v[8:11], off
-; W64-NEXT:    s_nop 0
-; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 1)
@@ -270,8 +244,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed_clamp(<4 x i32>
 ; W64-NEXT:    v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[1,1,0] clamp
 ; W64-NEXT:    global_store_b128 v[12:13], v[16:19], off
 ; W64-NEXT:    global_store_b128 v[14:15], v[8:11], off
-; W64-NEXT:    s_nop 0
-; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 1)
@@ -290,8 +262,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned(<2 x i32> %A
 ; W64-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7]
 ; W64-NEXT:    global_store_b128 v[8:9], v[12:15], off
 ; W64-NEXT:    global_store_b128 v[10:11], v[4:7], off
-; W64-NEXT:    s_nop 0
-; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 0)
@@ -308,8 +278,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed(<2 x i32> %A,
 ; W64-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[0,1,0]
 ; W64-NEXT:    global_store_b128 v[8:9], v[12:15], off
 ; W64-NEXT:    global_store_b128 v[10:11], v[4:7], off
-; W64-NEXT:    s_nop 0
-; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 0)
@@ -326,8 +294,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned(<2 x i32> %A,
 ; W64-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[1,0,0]
 ; W64-NEXT:    global_store_b128 v[8:9], v[12:15], off
 ; W64-NEXT:    global_store_b128 v[10:11], v[4:7], off
-; W64-NEXT:    s_nop 0
-; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 0)
@@ -344,8 +310,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed(<2 x i32> %A, <2
 ; W64-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[1,1,0]
 ; W64-NEXT:    global_store_b128 v[8:9], v[12:15], off
 ; W64-NEXT:    global_store_b128 v[10:11], v[4:7], off
-; W64-NEXT:    s_nop 0
-; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 0)
@@ -362,8 +326,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp(<2 x i
 ; W64-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] clamp
 ; W64-NEXT:    global_store_b128 v[8:9], v[12:15], off
 ; W64-NEXT:    global_store_b128 v[10:11], v[4:7], off
-; W64-NEXT:    s_nop 0
-; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 1)
@@ -380,8 +342,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp(<2 x i32
 ; W64-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[0,1,0] clamp
 ; W64-NEXT:    global_store_b128 v[8:9], v[12:15], off
 ; W64-NEXT:    global_store_b128 v[10:11], v[4:7], off
-; W64-NEXT:    s_nop 0
-; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 1)
@@ -398,8 +358,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp(<2 x i32
 ; W64-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[1,0,0] clamp
 ; W64-NEXT:    global_store_b128 v[8:9], v[12:15], off
 ; W64-NEXT:    global_store_b128 v[10:11], v[4:7], off
-; W64-NEXT:    s_nop 0
-; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 1)
@@ -416,8 +374,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed_clamp(<2 x i32>
 ; W64-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[1,1,0] clamp
 ; W64-NEXT:    global_store_b128 v[8:9], v[12:15], off
 ; W64-NEXT:    global_store_b128 v[10:11], v[4:7], off
-; W64-NEXT:    s_nop 0
-; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 1)
diff --git a/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll b/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll
index d31c9e7e03e793..c5a9ab31ca529d 100644
--- a/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll
@@ -21,8 +21,6 @@ define amdgpu_kernel void @workgroup_id_x(ptr addrspace(1) %ptrx) {
 ; GFX12-NEXT:    v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, 0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %idx = call i32 @llvm.amdgcn.workgroup.id.x()
   store i32 %idx, ptr addrspace(1) %ptrx
@@ -53,8 +51,6 @@ define amdgpu_kernel void @workgroup_id_xy(ptr addrspace(1) %ptrx, ptr addrspace
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX12-NEXT:    global_store_b32 v1, v2, s[2:3]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %idx = call i32 @llvm.amdgcn.workgroup.id.x()
   store i32 %idx, ptr addrspace(1) %ptrx
@@ -96,8 +92,6 @@ define amdgpu_kernel void @workgroup_id_xyz(ptr addrspace(1) %ptrx, ptr addrspac
 ; GFX12-NEXT:    global_store_b32 v1, v0, s[4:5]
 ; GFX12-NEXT:    global_store_b32 v1, v2, s[6:7]
 ; GFX12-NEXT:    global_store_b32 v1, v3, s[0:1]
-; GFX12-NEXT:    s_nop 0
-; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
   %idx = call i32 @llvm.amdgcn.workgroup.id.x()
   store i32 %idx, ptr addrspace(1) %ptrx

>From 9b23c97660ff1660c14eb6cb652f435f1ca426d4 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
Date: Thu, 17 Oct 2024 14:49:45 -0700
Subject: [PATCH 2/2] Updated test

---
 llvm/test/CodeGen/AMDGPU/cluster_stores.ll | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll
index bada3d904fbe38..fb3de211eaeb8e 100644
--- a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll
+++ b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll
@@ -318,8 +318,6 @@ define amdgpu_ps void @cluster_image_load(<8 x i32> inreg %src, <8 x i32> inreg
 ; GFX11-NEXT:    v_dual_add_f32 v2, v2, v6 :: v_dual_add_f32 v5, v5, v9
 ; GFX11-NEXT:    v_dual_add_f32 v4, v4, v8 :: v_dual_add_f32 v3, v3, v7
 ; GFX11-NEXT:    image_store v[2:5], v[0:1], s[8:15] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %x1 = add i32 %x, 1
@@ -374,8 +372,6 @@ define amdgpu_ps void @no_cluster_image_load(<8 x i32> inreg %src1, <8 x i32> in
 ; GFX11-NEXT:    v_dual_add_f32 v5, v5, v9 :: v_dual_add_f32 v4, v4, v8
 ; GFX11-NEXT:    v_dual_add_f32 v3, v3, v7 :: v_dual_add_f32 v2, v2, v6
 ; GFX11-NEXT:    image_store v[2:5], v[0:1], s[16:23] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %val1 = call <4 x float> @llvm.amdgcn.image.load.mip.2d.v4f32.i32(i32 15, i32 %x, i32 %y, i32 0, <8 x i32> %src1, i32 0, i32 0)
@@ -464,8 +460,6 @@ define amdgpu_ps void @cluster_image_sample(<8 x i32> inreg %src, <4 x i32> inre
 ; GFX11-NEXT:    v_dual_add_f32 v5, v5, v9 :: v_dual_add_f32 v4, v4, v8
 ; GFX11-NEXT:    v_dual_add_f32 v3, v3, v7 :: v_dual_add_f32 v2, v2, v6
 ; GFX11-NEXT:    image_store v[2:5], v[0:1], s[12:19] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 entry:
   %s = sitofp i32 %x to float



More information about the llvm-commits mailing list